def parse(city_url):  # parse one city's weather page
    response = requests.get(city_url)
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    # everything below is extracted with xpath
    current_city = html.xpath("//div[@class='search_default']/em/text()")[0]
    print('当前城市:' + current_city)
    current_kongqi = html.xpath(
        "//div[@class='left']/div[@class='wea_alert clearfix']/ul/li/a/em/text()")[0]
    print('空气质量:' + current_kongqi)
    current_wendu = html.xpath(
        "//div[@class='left']/div[@class='wea_weather clearfix']/em/text()")[0]
    print('当前温度:' + current_wendu + '℃')
    current_weather = html.xpath("//div[@class='wea_weather clearfix']/b/text()")[0]
    print('天气状况:' + current_weather)
    current_shidu = html.xpath(
        "//div[@class='left']/div[@class='wea_about clearfix']/span/text()")[0]
    print('当前湿度:' + current_shidu)
    current_fengji = html.xpath(
        "//div[@class='left']/div[@class='wea_about clearfix']/em/text()")[0]
    print('当前风速:' + current_fengji)
    jingdian = html.xpath(
        "//div[@class='right']/div[@class='near'][2]/div[@class='item clearfix']/ul/li/a/text()")
    print('附近景点:')
    for j in jingdian:
        print('\t\t' + j)
    # return current_city, current_kongqi, current_wendu, current_weather, current_shidu, current_fengji

    # def rewrite(city, kongqi, wendu, weather, shidu, fengji):
    #     # if not os.path.exists(weather.csv):  # create the file if it does not exist yet
    #     # else:
    #     with open('te8578978888888888st.txt', 'w') as f:
    #         f.write('hello, python')
    #     print("数据写入完成")

    with open(path + "\\" + city_name + 'weather.csv', 'a') as f:
        # f.write(current_city, current_shidu)
        data1 = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        time1 = time.strftime('%H:%M:%S', time.localtime(time.time()))
        writer = csv.writer(f)
        # write the header row first
        # writer.writerow(["index", "csv_1", "csv_2"])
        # use writerows for multiple rows
        try:
            # f.write('hello, python')
            writer.writerows([[data1, current_city[:3], current_wendu, current_shidu,
                               current_weather, current_fengji]])
            print("csv数据写入完成\n{},{}{}{}{}{}{}".format(
                data1, time1, current_city[:3], current_wendu, current_shidu,
                current_weather, current_fengji))
            print('等待正在运行....')
        except:
            print('数据保存错误')
            # writer.writerows([[data1, time1, current_city[:3], current_wendu, current_shidu, current_weather, current_fengji]])
def parseHtml(file):
    print("解析HTML")
    html = etree.HTML(file)
    tag3 = html.xpath('/html/tr/td[1]/text()')
    print(tag3)
def parse_html(html, xpath):
    element = etree.HTML(html)
    return element.xpath(xpath)
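# A minimal usage sketch for parse_html above; the URL and the XPath are
# illustrative placeholders, not taken from any of the original scripts.
import requests

page = requests.get('http://example.com').text
titles = parse_html(page, '//title/text()')  # returns a list of matching nodes
print(titles)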
import urllib3
import csv
import requests
from collections import namedtuple
from lxml.html import etree

TWSE_EQUITIES_URL = 'http://isin.twse.com.tw/isin/C_public.jsp?strMode=2'
TPEX_EQUITIES_URL = 'http://isin.twse.com.tw/isin/C_public.jsp?strMode=4'

ROW = namedtuple(
    'Row', ['type', 'code', 'name', 'ISIN', 'start', 'market', 'group', 'CFI'])


def fetch_data(url):
    r = requests.get(url)
    return r


if __name__ == '__main__':
    data = fetch_data(TWSE_EQUITIES_URL)
    root = etree.HTML(data.text)
    trs = root.xpath('//tr')[1:]
def parse(self, current_city_url, html, city_name):
    file_name = f'租房{self.today_str}/{city_name}{self.today_str}房天下租房.xlsx'
    if not os.path.exists(file_name):
        wb = openpyxl.Workbook()
        ws = wb.worksheets[0]
        self.save_to_excel(ws, 0, self.excel_head)
        wb.save(file_name)
    wb = openpyxl.load_workbook(file_name)
    ws = wb.worksheets[0]
    next_url = True
    row_count = 1
    while next_url:
        html_eles = etree.HTML(html)
        # grab the "next page" link
        next_url = html_eles.xpath('//a[text()="下一页"]/@href')
        next_url = current_city_url + next_url[0][1:] if next_url else None
        # all rental entries under the houseList container
        house_eles = html_eles.xpath('//div[@class="houseList"]/dl')
        # walk every house entry and pull its fields
        for house_ele in house_eles:
            # house id (used for de-duplication)
            house_id = house_ele.xpath('./dd/p[1]/a/@href')
            if house_id:
                # if the image count is missing the entry is an ad, so the except branch skips it
                try:
                    house_id = house_id[0].split('/')[-1].split('.')[0]  # shortened id
                    # number of pictures
                    tupian = house_ele.xpath('.//span[@class="iconImg"]/text()')[0]
                    # price
                    price = house_ele.xpath('.//span[@class="price"]/text()')[0]
                    # renttype/shiting/mianji/chaoxiang
                    main_info = [
                        re.sub('\r|\n| |', '', field).replace('�O', '㎡')
                        for field in house_ele.xpath('./dd/p[2]//text()')
                        if field != '|'
                    ]
                    if len(main_info) != 4:
                        continue
                    renttype = main_info[0]
                    shiting = main_info[1]
                    mianji = main_info[2]
                    chaoxiang = main_info[3]
                    # district, street and estate name
                    position_info = [
                        field for field in house_ele.xpath('./dd/p[3]/a/span/text()')
                    ]
                    if len(position_info) != 3:
                        continue
                    xiaqu = position_info[0]
                    jiedao = position_info[1]
                    xiaoqu = position_info[2]
                    jiaotong = ''.join(
                        house_ele.xpath('.//span[@class="note subInfor"]//text()'))
                    jiaotong = jiaotong if jiaotong else '无'
                except:
                    pass
                else:
                    if row_count > 3000:
                        wb.save(file_name)
                        return
                    if house_id not in self.quchong[city_name]:
                        # print(house_id, tupian, price, renttype, shiting, mianji, chaoxiang, xiaqu, jiedao, xiaoqu, jiaotong)
                        print(f'正在爬取:{city_name}-->第{row_count}条租房信息')
                        # save the row
                        self.save_to_excel(ws, row_count, [
                            self.today_str, city_name, tupian, price, renttype,
                            shiting, mianji, chaoxiang, xiaqu, jiedao, xiaoqu,
                            jiaotong,
                        ])
                        row_count += 1
                        # remember the id so the same house is not scraped again
                        self.quchong[city_name].append(house_id)
                    else:
                        print('已存在')
        if next_url:
            html = self.get_html(next_url)
    wb.save(file_name)
def query_title(lession_id):
    url = "https://edu.51cto.com/center/course/lesson/index?type=wejob&id=" + lession_id
    html = etree.HTML(requests.get(url, headers=headers).text)
    title = html.xpath('//title')[0].text.replace(" ", "-")[:-8]
    return title
def get_parse(url):
    content = requests.get(url, headers=useragentutil.get_headers()).content
    parse = etree.HTML(content)
    return parse
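# A short hedged usage sketch for get_parse above. useragentutil is the
# project's own helper module, so this assumes it (and get_parse) are importable;
# the URL is a placeholder.
parse = get_parse('http://example.com')
links = parse.xpath('//a/@href')  # the returned element tree supports xpath directly
print(len(links), 'links found')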
def url_to_md_txt_hexo(url, tap):
    try:
        url = f'https://www.cnblogs.com/{url}'
        response = requests.get(url)
        # print(response.text)
        a = re.findall(
            '<div id="cnblogs_post_body" class="blogpost-body.*?">(.*?)<div id="MySignature"></div>',
            response.text, re.S)
        if not a:
            response_dome = BeautifulSoup(response.text, 'html.parser')
            response_dome_str = str(response_dome.div)
            a = re.findall(
                '<div class="postBody">(.*?)<div id="MySignature"></div>',
                response_dome_str, re.S)
        a = a[0]
        # strip leading/trailing whitespace
        a = a.strip()
        # drop the trailing </div>
        a = a[:-6]
        # strip whitespace once more
        a = a.strip()
        # headings: turn <h1>..<h6> into markdown headings
        a = re.sub('<h1>.*?\d*\. (?P<name>.*?)</h1>', '<h1>\g<name>\n\n</h1>', a)
        a = re.sub('<h1.*?>', '# ', a)
        a = re.sub('<h2>.*?\d*\.\d* (?P<name>.*?)</h2>', '<h2>\g<name>\n\n</h2>', a)
        a = re.sub('<h2.*?>', '## ', a)
        a = re.sub('<h3>.*?\d*\.\d*\.\d* (?P<name>.*?)</h3>', '<h3>\g<name>\n\n</h3>', a)
        a = re.sub('<h3.*?>', '### ', a)
        a = re.sub('<h4>.*?\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h4>', '<h4>\g<name>\n\n</h4>', a)
        a = re.sub('<h4.*?>', '#### ', a)
        a = re.sub('<h5>.*?\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h5>', '<h5>\g<name>\n\n</h5>', a)
        a = re.sub('<h5.*?>', '##### ', a)
        a = re.sub('<h6>.*?\d*\.\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h6>', '<h6>\g<name>\n\n</h6>', a)
        a = re.sub('<h6.*?>', '###### ', a)
        a = re.sub('</h1>|</h2>|</h3>|</h4>|</h5>|</h6>|', "", a)
        # print(a)
        # fenced code blocks (triple backticks)
        if '<pre class=' in a:
            a = re.sub('<pre class="', '```', a)
            a = re.sub('"><code>', '\n', a)
        a = re.sub('<pre><code.*?>', '```\n', a)
        a = re.sub('</code></pre>', '\n```', a)
        # the other markup style cnblogs uses for code blocks
        a = re.sub('<div class="cnblogs_code".*?>', '```python', a)
        a = re.sub('</div>', '```', a)
        # inline code (single backtick)
        a = re.sub('<code.*?>|</code>', '`', a)
        # remaining tags
        # drop opening div tags
        a = re.sub('<div.*?>', '', a)
        # em tags
        a = re.sub('<em.*?>|</em>', ' ', a)
        # strong tags become bold
        a = re.sub('<strong>|</strong>', '**', a)
        # span tags
        a = re.sub('<span.*?>|</span>', '', a)
        # pre tags
        a = re.sub('<pre.*?>|</pre>', '', a)
        # p tags
        a = re.sub('<p.*?>|</p>', '', a)
        # br tags
        a = re.sub('<br/>', '\n', a)
        # unescape HTML entities in the body
        # double quote
        a = re.sub('&quot', '"', a)
        # single quote
        a = re.sub('&#39', "'", a)
        # > sign
        a = re.sub('&gt', '>', a)
        # < sign
        a = re.sub('&lt', '<', a)
        # ul and li
        a = re.sub('<ul.*?>|</ul>|</li>', '', a)
        a = re.sub('<li.*?>', '- ', a)
        # fix the semicolons left over from the entity replacements
        print(a)
        a = re.sub('<;', '<', a)
        a = re.sub('>;', '>', a)
        a = re.sub(';/', '/', a)
        # everything above converts the HTML to markdown
        # prepend the hexo front-matter header
        title_xpath = '//a[@id="cb_post_title_url"]/text()'
        response_html = etree.HTML(response.text)
        title = response_html.xpath(title_xpath)[0]
        data_xpath = '//*[@id="post-date"]/text()'
        data = response_html.xpath(data_xpath)[0]
        data_header = f'---\ntitle: {title} \ndate: {data} \ntags: {tap} \n\n\n---\n'
        a = data_header + a
        return a
    # some blogs use a different layout, so the patterns above may not find the content
    except:
        print('on')
        return False
# {"http": "111.155.116.220:8123"}, # {"https": "58.19.63.57:18118" }, # {"https": "183.159.85.234:3128" }, # {"https": "223.240.208.151:18118"}, # {"http": "117.68.193.19:18118" }, ] httpCol={} # 有效代理IP池 proxypool = [] #http://www.66ip.cn/ proxy_urls = ['http://www.ip3366.net/?stype=1&page={}'.format(n) for n in range(1,11)] for proxy_url in proxy_urls: #print(proxy_url) r = requests.get(url=proxy_url) if r.status_code==200: html = etree.HTML(r.text) selectors = html.xpath('//*[@id="list"]/table/tbody/tr') #print(selectors) for row in selectors: host = row.xpath('td[1]/text()')[0] port = row.xpath('td[2]/text()')[0] httpCol['http']=host+":"+port proxies.append(httpCol) httpCol={} r.close() #print(proxies) f = open("EffectiveIp.json", 'w') f.write('[')
def make_data(mode, url=""): """ :param mode: 模式,支持的模式有share like1 like2 tag :param url: 生成data需要用到url,share like1 需要的是用户主页的url,tag需要的是tag页的url。like2不会用到,因为信息在cookies种 :return: 初始data """ if (mode == "like1" or mode == "share" or mode == "tag") and url == "": print("{}模式生成data需要url参数".format(mode)) return {} base_data = {'callCount': '1', 'httpSessionId': '', 'scriptSessionId': '${scriptSessionId}187', 'c0-id': '0', "batchId": "472351"} get_num = 100 got_num = 0 if mode == "share" or mode == "like1": userId = "" user_page_parse = etree.HTML( requests.get(url, headers=useragentutil.get_headers()).content.decode("utf-8")) try: userId = user_page_parse.xpath("//body/iframe[@id='control_frame']/@src")[0].split("blogId=")[1] except: print("\n链接与模式不匹配") exit() data_parme = { 'c0-scriptName': 'BlogBean', "c0-methodName": "", 'c0-param0': 'number:' + str(userId), 'c0-param1': 'number:' + str(get_num), 'c0-param2': 'number:' + str(got_num), 'c0-param3': 'string:'} if mode == "like1": data_parme["c0-methodName"] = "queryLikePosts" else: data_parme["c0-methodName"] = "querySharePosts" elif mode == "like2": data_parme = {"c0-scriptName": "PostBean", "c0-methodName": "getFavTrackItem", "c0-param0": "number:" + str(get_num), "c0-param1": "number:" + str(got_num), } elif mode == "tag": # 参数8要拿时间戳 url_search = re.search("http[s]{0,1}://www.lofter.com/tag/(.*?)/(.*)", url) type = url_search.group(2) if type == "": type = "new" data_parme = {'c0-scriptName': 'TagBean', 'c0-methodName': 'search', 'c0-param0': 'string:' + url_search.group(1), 'c0-param1': 'number:0', 'c0-param2': 'string:', 'c0-param3': 'string:' + type, 'c0-param4': 'boolean:false', 'c0-param5': 'number:0', 'c0-param6': 'number:' + str(get_num), 'c0-param7': 'number:' + str(got_num), 'c0-param8': 'number:' + str(int(time.time() * 1000)), 'batchId': '870178'} else: print("data-模式错误") data_parme = {} data = {**base_data, **data_parme} return data
def fetch_questions(url, chapter_id):
    """
    Extract the questions for one knowledge point.
    :url string: knowledge-point url
    :chapter_id: knowledge-point id
    """
    print("题目提取开始:{0} {1}".format(chapter_id, url))
    # only process question pages such as https://m10.bjzjxf.com/Home/Index/qaq/1985
    if url.find("qaq") == -1:
        print("非题目页面,跳过")
        return
    if redis.exists(PROCESSED_URLS) and redis.sismember(PROCESSED_URLS, url):
        print("题目已处理,跳过")
        return
    res = http_request(url)
    html = etree.HTML(res)
    # the number of questions is embedded in the page text
    try:
        number = html.xpath("//div[@id='1']/text()")[0]
        number = int(re.findall(r"共(\d+)题", number)[0])
    except IndexError:
        redis.sadd(PROCESSED_URLS, url)
        print("此知识点没有题目")
        return
    items = []
    qhtml = html.xpath("//div[@class='dati']")[0]
    contents = qhtml.xpath("./b/text()")  # question bodies
    selects = qhtml.xpath("./ul")  # answer options
    answers = qhtml.xpath("./div[@class='answer']")  # answers
    if len(selects) != number or len(contents) != number or len(answers) != number:
        redis.sadd(PROCESSED_URLS, url)
        print("此知识点内容有误,请手动处理")
        redis.sadd("need_handle_urls", url)
        return
    for i in range(0, number):
        # extract the options
        select_list = selects[i].xpath("./li/text()")
        select_list = split_array(select_list, 2)
        select_list = [
            " ".join(x).replace("\u2003\u2002", "") for x in select_list
        ]
        # extract the answer
        answer_text = list(
            filter(lambda x: x != "您选择:", answers[i].xpath(".//text()")))
        item = {
            "title": qhtml.xpath("./div[@id={0}]/text()".format(i + 1)),
            "select": "\n".join(select_list),
            "content": contents[i],
            "answer": answer_text[0] + answer_text[1] + "\n".join(answer_text[2:]),
            "order": i + 1,
        }
        items.append(item)
    # insert the questions into the database
    for item in items:
        if isinstance(item["title"], list):
            item["title"] = item["title"].pop()
        row = db.select_one(
            "SELECT * FROM `tk_questions` WHERE `chapter_id`=%s and `title`=%s",
            (chapter_id, item["title"]))
        if row is not None:
            print("{0} 题目已存在,无法插入!".format(item["title"]))
            continue
        sql = """
            INSERT INTO `tk_questions` (
                `chapter_id`, `title`, `content`, `select`, `answer`, `order`)
            VALUES (
                %s, %s, %s, %s, %s, %s)
        """
        data = (chapter_id, item["title"], item["content"], item["select"],
                item["answer"], item["order"])
        db.insert(sql, data)
    redis.sadd(PROCESSED_URLS, url)
    print("此知识点题目提取完毕")
def infor_formater(favs_info, fav_str, mode, file_path, start_time, min_hot, print_level):
    # pull the fields out of the raw dump, mostly with regular expressions
    format_fav_info = []
    start_time_stamp = ""
    if start_time:
        start_time_stamp = time.mktime(time.strptime(start_time, "%Y-%m-%d"))
    for fav_info in favs_info:
        blog_info = {}
        # blog url
        try:
            url = re.search('s\d{1,5}.blogPageUrl="(.*?)"', fav_info).group(1)
        except:
            print("博客{} 信息丢失,跳过".format(favs_info.index(fav_info) + 1))
            continue
        blog_info["url"] = url
        if print_level:
            print("博客{} {}准备解析".format(favs_info.index(fav_info) + 1, url), end="\t")
        # time the blog was liked
        fav_timestamp = re.search('s\d{1,5}.opTime=(.*?);', fav_info).group(1)
        # in like2 mode, stop once posts are older than the configured start time
        if mode == "like2" and start_time:
            if int(fav_timestamp) / 1000 < start_time_stamp:
                print("已将指定时间内的博客解析结束")
                break
        blog_hot = int(re.search('s\d{1,5}.hot=(.*?);', fav_info).group(1))
        if mode == "tag" and blog_hot < min_hot:
            print("当前博客的热度小于设定热度,跳过")
            continue
        time_local2 = time.localtime(int(int(fav_timestamp) / 1000))
        fav_time = time.strftime("%Y-%m-%d", time_local2)
        blog_info["fav time"] = fav_time
        # author name
        author_name_search = re.search('s\d{1,5}.blogNickName="(.*?)"', fav_info)
        if author_name_search:
            author_name = author_name_search.group(1).encode('latin-1').decode(
                'unicode_escape', errors="replace")
        # if nothing matched, an earlier blog on this page belongs to the same author,
        # so find the info id and search for the author name in the preceding text
        else:
            info_id = re.search("s\d{1,5}.blogInfo=(s\d{1,5})", fav_info).group(1)
            test_names = re.findall(info_id + '.blogNickName="(.*?)"',
                                    fav_str.split('blogPageUrl="' + url + '"')[0])
            author_name = test_names[-1].encode('latin-1').decode(
                'unicode_escape', errors="replace")
        blog_info["author name"] = author_name
        # characters that are illegal in file names are replaced; ASCII parentheses become
        # full-width ones so duplicate-filename checks are not broken by splitting
        author_name_in_filename = author_name.replace("/", "&").replace("|", "&").replace("\r", " ").replace(
            "\\", "&").replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
            .replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")").strip()
        blog_info["author name in filename"] = author_name_in_filename
        # author's lofter subdomain
        author_ip = re.search("http[s]{0,1}://(.*?).lofter.com", url).group(1)
        blog_info["author ip"] = author_ip
        # publish time
        public_timestamp = re.search('s\d{1,5}.publishTime=(.*?);', fav_info).group(1)
        time_local1 = time.localtime(int(int(public_timestamp) / 1000))
        public_time = time.strftime("%Y-%m-%d", time_local1)
        blog_info["public time"] = public_time
        # tags
        tags = re.search('s\d{1,5}.tag[s]{0,1}="(.*?)";', fav_info).group(1).strip().encode('utf-8').decode(
            'unicode_escape').split(",")
        if tags[0] == "":
            tags = []
        lower_tags = []
        for tag in tags:
            # lower-case and convert full-width spaces to half-width
            lower_tag = tag.lower().replace("　", " ").strip()
            lower_tags.append(lower_tag)
        blog_info["tags"] = lower_tags
        # title
        try:
            title = re.search('s\d{1,5}.title="(.*?)"', fav_info).group(1).encode('latin-1').decode(
                'unicode_escape', errors="ignore")
        except:
            title = ""
        title_in_filename = title.replace("/", "&").replace("|", "&").replace("\r", " ").replace("\\", "&") \
            .replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
            .replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")").strip()
        blog_info["title"] = title
        blog_info["title in filename"] = title_in_filename
        # image urls
        img_urls = []
        urls_search = re.search('originPhotoLinks="(\[.*?\])"', fav_info)
        if urls_search:
            urls_str = urls_search.group(1).replace("\\", "").replace("false", "False").replace("true", "True")
            urls_infos = eval(urls_str)
            for url_info in urls_infos:
                # "raw" is the original image without any suffix, but it is not always
                # present, so fall back to "orign"
                try:
                    url = url_info["raw"]
                except:
                    url = url_info["orign"].split("?imageView")[0]
                if "netease" in url:
                    url = url_info["orign"].split("?imageView")[0]
                img_urls.append(url)
        blog_info["img urls"] = img_urls
        # post body
        tmp_content1 = re.search('s\d{1,5}.content="(.*?)";', fav_info).group(1)
        parse = etree.HTML(tmp_content1)
        # if tmp_content1:
        #     f = parse.xpath("//p//text()")
        #     tmp_content2 = "\n".join(f)
        #     content = tmp_content2.encode('latin-1').decode("unicode_escape", errors="ignore").strip()
        # else:
        #     content = ""
        # blog_info["content"] = content
        content = html2text.html2text(
            tmp_content1.encode('latin-1').decode("unicode_escape", errors="ignore"))
        blog_info["content"] = content
        # images embedded in the post body
        illustration = []
        if tmp_content1:
            # new url format
            img_src = parse.xpath("//img/@src")
            illustration = re.findall('"(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)\?',
                                      "\n".join(img_src))
            if illustration == []:
                # old url format
                illustration = re.findall('"(http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*?)\?',
                                          "\n".join(img_src))
        blog_info["illustration"] = illustration
        # external links
        if tmp_content1:
            link_a = parse.xpath("//a/@href")
            external_link = list(map(lambda x: x.replace("\\", "").replace('"', ''), link_a))
        else:
            external_link = []
        blog_info["external link"] = external_link
        # long-form article
        l_content = ""
        l_cover = ""
        l_url = []
        l_img = []
        long_article = re.search('s\d{1,5}.compositeContent="(.*?)";s\d{1,5}', fav_info)
        try:
            if long_article:
                long_article1 = long_article.group(1)
                parse = etree.HTML(long_article.group(1))
                l_cover = re.search('s\d{1,5}.banner="(.*?)";', fav_info).group(1)
                l_url = parse.xpath("//a//@href")
                l_url = list(map(lambda x: x.replace("\\", "").replace('"', ''), l_url))
                l_img = parse.xpath("//img/@src")
                l_img = list(map(lambda x: x.replace("\\", "").replace('"', ''), l_img))
                l_content = re.sub('<[^<]+?>', '', long_article1).replace("　", " ").strip()
                l_content = l_content.encode('latin-1').decode("unicode_escape", errors="ignore").strip()
        except:
            # print("长文章 {} 被屏蔽,无法获取正文".format(url))
            pass
        blog_info["long article content"] = l_content
        blog_info["long article url"] = l_url
        blog_info["long article img"] = l_img
        blog_info["long article cover"] = l_cover
        # video_url_search = re.findall('"originUrl":""')
        # collect the parsed result
        format_fav_info.append(blog_info)
        if print_level:
            print("解析完成,具体信息:\n{}".format(blog_info))
            print("----" * 20)
        else:
            if favs_info.index(fav_info) % 100 == 0 or len(format_fav_info) == len(favs_info):
                print("解析进度 {}/{} 正在解析的博客链接 {}".format(
                    len(format_fav_info), len(favs_info), blog_info["url"]))
    # write the result to a file
    with open(file_path + "/format_blogs_info.json", "w", encoding="utf-8", errors="ignore") as op:
        op.write(json.dumps(format_fav_info, ensure_ascii=False, indent=4))
def parse(self, url):
    ret = requests.get(url, headers=self.headers)
    html = etree.HTML(ret.content.decode())
    page_count = html.xpath('//*[@id="J_bottomPage"]/span[2]/em[1]/b/text()')[0]
    count = int(page_count) + 1
    for i in range(1, count):
        print("开始爬取数据")
        data_url = url + '&page=' + str(i)
        data = requests.get(data_url, headers=self.headers)
        html = etree.HTML(data.content.decode())
        item_lis = html.xpath('//*[@id="plist"]/ul/li')
        data_list = []
        for i, item in enumerate(item_lis):
            dic = {}
            image = item.xpath('./div/div[1]/a/img/@src')
            desc = item.xpath("./div/div[4]/a/em/text()")[0].strip()
            data_sku = item.xpath('./div/@data-sku')[0]
            venderid = item.xpath('./div/@venderid')[0]
            get_sku_url = "https://p.3.cn/prices/mgets?skuIds=J_{}".format(data_sku)
            get_store_url = "https://rms.shop.jd.com/json/pop/shopInfo.action?ids={}".format(venderid)
            price_dic = requests.get(get_sku_url, headers=self.headers)
            stort_dic = requests.get(get_store_url, headers=self.headers)
            price = json.loads(price_dic.content.decode())[0]['p']
            store = json.loads(stort_dic.content.decode(encoding='GBK'))[0]['name']
            detail_url = item.xpath('./div/div[1]/a/@href')[0]
            detail_url = "https:{}".format(detail_url)
            detail_data = requests.get(detail_url, headers=self.headers)
            html = etree.HTML(detail_data.text)
            color = html.xpath('//*[@id="choose-attr-1"]/div/div/@data-value')
            version = html.xpath('//*[@id="choose-attr-2"]/div/div/@data-value')
            w_url = "https://c0.3.cn/stock?skuId={}&area=15_1243_3419_0&venderId={}&choseSuitSkuIds=&cat=9987,653,655".format(
                data_sku, venderid)
            d = requests.get(w_url, headers=self.headers)
            weight = json.loads(d.content.decode("GBK"))['stock'].get("weightValue")
            dic['id'] = data_sku
            dic['image'] = image
            dic['price'] = price
            dic['description'] = desc
            dic['store'] = store
            dic['url'] = detail_url
            dic['color'] = color
            dic['version'] = version
            dic['weight'] = weight
            data_list.append(dic)
            print('商品{}爬取完成'.format(data_sku))
        print("当前爬取的url是:", data_url)
        self.save_to_mongo(data_list)
def page_istrue(self, response):
    selector = etree.HTML(response)
    if selector.xpath('//div[@class="list3 clearfix"]'):
        return True
    else:
        return False
    socket.setdefaulttimeout(5)
    req = request.Request(url, headers=headers)
    try:
        page = request.urlopen(req).read()
    except:
        page = ''
    return page


if __name__ == '__main__':
    # fetch the page and parse it
    url = 'https://finance.sina.com.cn/money/forex/hq/USDCNY.shtml'
    page = get_url(url)
    print(page)
    tree = etree.HTML(page)
    # use xpath to parse the top-ten tradable shareholders
    stocktitle = tree.xpath(u"/*[@id='hotHorex']")
    print(stocktitle)
    # title = stocktitle[].text
    # getdate = title[title.find('(')+1:title.find(')')]
    # param = []
    # nodes = tree.xpath(u"/html/body/div[9]/div[32]/table")
    # for node in nodes:
    #     for data in node:
    #         stockhold = []
    #         for listdata in data:
job_info = {}
f = open("20200711-II.json", 'w', encoding='utf-8')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
}
localtime = time.asctime(time.localtime(time.time()))
for i in range(0, 9):
    url = 'https://www.liepin.com/zhaopin/?compkind=&dqs=250&pubTime=&pageSize=40&salary=&compTag=&sortFlag=15&degradeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&siTag=bFGQTbwE_AAQSb-u11jrBw%7EE08QNgJtmOV680BaDaEpHQ&d_sfrom=search_prime&d_ckId=cacf3d164385361dba08f1766c63a3a1&d_curPage=' + str(
        i + 1
    ) + '&d_pageSize=40&d_headId=cacf3d164385361dba08f1766c63a3a1&curPage=' + str(i)
    # https://www.liepin.com/zhaopin/?compkind=&dqs=250&pubTime=&pageSize=40&salary=&compTag=&sortFlag=15&degradeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&siTag=bFGQTbwE_AAQSb-u11jrBw%7EE08QNgJtmOV680BaDaEpHQ&d_sfrom=search_prime&d_ckId=cacf3d164385361dba08f1766c63a3a1&d_curPage=0&d_pageSize=40&d_headId=cacf3d164385361dba08f1766c63a3a1&curPage=1
    response = r.Request(url=url, headers=headers)
    data = r.urlopen(response).read().decode("utf-8")
    data1 = etree.HTML(data)
    for j in range(40):
        # data2 = data1.xpath("//div[@class='job-info']//a[@target='_blank']/@href".format(j))
        # print(data2)
        # for I in data2:
        try:
            job_title = data1.xpath("//div[@class='job-info']/h3/a/text()")[j]
        except:
            job_title = []
        try:
            job_title_url = data1.xpath(
                "//div[@class='company-info nohover']/p//a[@target='_blank']/@href")[j]
        except:
            job_title_url = []
def run(author_url, start_time, end_time, target_tags, tags_filter_mode, file_update_interval):
    author_page_parse = etree.HTML(
        requests.get(author_url,
                     headers=useragentutil.get_headers()).content.decode("utf-8"))
    # the id is a numeric parameter needed to fetch the archive page;
    # the "ip" is the author's lofter subdomain, chosen at registration
    author_id = author_page_parse.xpath(
        "//body/iframe[@id='control_frame']/@src")[0].split("blogId=")[1]
    author_ip = re.search(r"http[s]*://(.*).lofter.com/", author_url).group(1)
    try:
        author_name = author_page_parse.xpath("//title//text()")[0]
    except:
        author_name = input("解析作者名时出现异常,请手动输入\n")
    archive_url = author_url + "dwr/call/plaincall/ArchiveBean.getArchivePostByTime.dwr"
    query_num = 50
    data = make_data(author_id, query_num)
    head = make_head(author_url)
    try:
        print("作者名%s,lofter ip %s,主页链接 %s" % (author_name, author_ip, author_url))
    except:
        print("作者名中有异常符号,无法显示,lofter ip %s,主页链接 %s" % (author_ip, author_url))
    deal_file("init")
    dir_path = "./dir/author_img_file"
    # check how far blog parsing has progressed
    if is_file_in(dir_path + "/blogs_info.json") == "finished":
        print("所有博客已解析完毕,跳转至图片下载")
    elif is_file_in(dir_path + "/blogs_info.json"):
        blogs_info = get_file_contetn(dir_path + "/blogs_info.json")
        parsed_blogs_info = get_file_contetn(dir_path + "/blogs_info_parsed.json")
        print("读取到上次运行保存的博客信息:未解析博链接%d条,已解析链接%d条,接上次继续运行" %
              (len(blogs_info), len(parsed_blogs_info)))
        parse_blogs_info(blogs_info, parsed_blogs_info, author_name, author_ip,
                         target_tags, tags_filter_mode, file_update_interval)
    else:
        print("开始获取归档页面数据,链接 %s (不能直接点开)" % archive_url)
        blog_infos = parse_archive_page(url=archive_url, data=data, header=head,
                                        author_url=author_url, query_num=query_num,
                                        start_time=start_time, end_time=end_time)
        parsed_blogs_info = get_file_contetn(dir_path + "/blogs_info_parsed.json")
        file_update(dir_path + "/blogs_info.json", blog_infos)
        print("归档页面数据保存完毕,开始解析博客页面")
        parse_blogs_info(blog_infos, parsed_blogs_info, author_name, author_ip,
                         target_tags, tags_filter_mode, file_update_interval)
    print("博客解析完毕,开始图片下载")
    # check how far image saving has progressed
    if is_file_in(dir_path + "/imgs_info.json") == "finished":
        print("该作者首页的所有图片已保存完毕,无需操作")
    else:
        imgs_info = get_file_contetn(dir_path + "/imgs_info.json")
        imgs_info_saved = get_file_contetn(dir_path + "/imgs_info_saved.json")
        download_img(imgs_info, imgs_info_saved, author_name, author_ip,
                     file_update_interval)
    print("所有图片保存完毕")
    deal_file("del")
    print("程序运行结束")
# -*- coding: utf-8 -*-
"""
Created on 2019/1/18 16:41
@Author: Johnson
@Email:[email protected]
@File: 星座.py
"""
import requests
from lxml.html import etree
import json
import time  # import the required modules

# daily horoscope
response = requests.get('https://www.xzw.com/fortune/taurus/')
if not response.status_code == 200:
    print('星座运势请求错误:' + str(response.status_code))
sel = etree.HTML(response.text)
fortune = sel.xpath('//div[@class="c_box"]/div[@class="c_cont"]/p/span/text()')[0]
print(fortune)
def parse_search_page(self, res, first_date, last_date):
    """Parse the statement search page and build the search payload."""
    ys, ms, ds = first_date.split('-')
    ye, me, de = last_date.split('-')
    html = etree.HTML(res.get('msg'))
    build_pay_load = {}
    ch_date = lambda x: x if len(x) == 2 else '0%s' % x
    for e in html.xpath('//input[@type="hidden"]'):
        name = e.xpath('./@name')[0]
        try:
            value = e.xpath('./@value')[0]
        except:
            value = ""
        value = quote(value.encode('gbk'))
        if name == 'Begin_date':
            value = ''.join([ys, ms, ch_date(ds)])
        elif name == 'End_date':
            value = ''.join([ye, me, ch_date(de)])
        elif name == 'Qry_date':
            value = ys
        build_pay_load[name] = value
    base_payload = {
        'Corpor_id': '1',
        'Account_num': '4000010109200194412',
        'yearname1': ys,
        'dayname1': ds,
        'yearname2': ye,
        'monthname2': me,
        'dayname2': de,
        'monthname1': ms,
    }
    build_pay_load.update(base_payload)
    build_pay_load = '&'.join(
        ['%s=%s' % (k, v) for k, v in build_pay_load.items()])
    search_url = '%s/servlet/com.ibm.btt.cs.servlet.CSReqServlet' % self.base_url
    # query the statements
    res = self.deal_result(self.execute(
        'POST', search_url, data=build_pay_load,
        content_type='application/x-www-form-urlencoded'), err_type='icbc')
    if not res.get('succ'):
        return res
    # crawl the paginated results
    data_list = []
    for p in range(20, 200, 20):
        logger.info('begin_pos:----------------------------%s' % p)
        data, html = self.crawler_down_list(res.get('msg'))
        if not data:
            continue
        data_list.extend(data)
        payload = self.build_next_payload(html, p)
        if not payload:
            time.sleep(0.25)
            continue
        res = self.deal_result(
            self.execute('POST', search_url, data=payload,
                         content_type='application/x-www-form-urlencoded'))
        if res.get('succ'):
            time.sleep(0.25)
            continue
        time.sleep(0.25)
    else:
        return {'succ': True, 'data': data_list}
import requests
from lxml.html import etree
import json

url = "http://www.lovehhy.net/Joke/Detail/QSBK/"
uu = "http://www.foshannews.net/jtzb2016/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
req = requests.get(uu, headers=headers).content.decode('utf-8', 'ignore')
# print(req)
rsp = etree.HTML(req)
# content = rsp.xpath('//div[@class="post_recommend_new"]//text()')
items = []
for i in rsp.xpath('//ul[@class="mbd dot f14"]/li/a'):
    cc = i.xpath('./@href')[0].strip(".")
    title = i.xpath('./@title')[0]
    cc = "http://www.foshannews.net/jtzb2016/" + cc
    eq = requests.get(cc, headers=headers).content.decode('utf-8', 'ignore')
    sp = etree.HTML(eq)
    cont = sp.xpath('//div[@class="TRS_Editor"]/p')
    result = {
        '标题': title,
        '标题网站': cc,
    }
headers = {
    "cookie": "__cfduid=dfa5a44a56e1f4818da6dc1c0442d32e61555031717; _"
              "ga=GA1.2.446599568.1555031722; trc_cookie_storage=taboola%2520global%253Auser-id%3Df47e0355-c5e3-4ac8-8d9c-69e65b8be1c0-tuct3a468dd; "
              "ShowSubtitleDetails=true; ShowSubtitlePreview=true; "
              "HearingImpaired=2; ForeignOnly=False; _gid=GA1.2.1534139390.1556500043; LanguageFilter=28; "
              "cookieconsent_dismissed=yes; cf_clearance=5c22147cf3e89737a1f9ac602ed6b8491cc6bc33-1556588618-31536000-150",
    "pragma": "no-cache",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
}

with open('url.txt', 'r', encoding='utf8') as f:
    urls = f.readlines()

url_list = []
for index, url in enumerate(urls):
    url = url.strip()
    session = requests.session()
    resp = session.get(url=url, headers=headers)
    root = etree.HTML(resp.text)
    result = root.xpath('//div[@class="download"]/a/@href')
    result = ["https://subscene.com" + short_url for short_url in result]
    url_list.extend(result)

with open('download_url.txt', 'w', encoding='utf8') as f:
    f.writelines("\n".join(url_list))
# @Time : 2020/1/7 10:41
# @Author : yangmingming
# @Site :
# @File : creawl_html.py
# @Software: PyCharm
import sys
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from PyQt5.QtWebKitWidgets import *
from lxml.html import etree


class WebRender(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self.__loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def __loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()


url = "https://www.baidu.com"
r = WebRender(url)
html = r.frame.toHtml()
print(html)
page = etree.HTML(html.encode('utf-8'))
def parse_one(self, url, i):
    print("开始爬取第 {} 页数据".format(i))
    data_url = url + '&page=' + str(i)
    data = requests.get(data_url, headers=self.headers)
    html = etree.HTML(data.content.decode())
    item_lis = html.xpath('//*[@id="plist"]/ul/li')
    data_list = []
    for i, item in enumerate(item_lis):
        dic = {}
        image = item.xpath('./div/div[1]/a/img/@src')
        desc = item.xpath("./div/div[4]/a/em/text()")[0].strip()
        data_sku = item.xpath('./div/@data-sku')[0]
        venderid = item.xpath('./div/@venderid')[0]
        print("开始爬取商品 {} ".format(data_sku))
        get_sku_url = "https://p.3.cn/prices/mgets?skuIds=J_{}".format(data_sku)
        # ft = Redis_filter()
        # d = ft.get(get_sku_url)
        # if d:
        #     print('__________该商品已经存在!________跳过该商品')
        #     continue
        # ft.save(get_sku_url)
        get_store_url = "https://rms.shop.jd.com/json/pop/shopInfo.action?ids={}".format(venderid)
        price_dic = requests.get(get_sku_url, headers=self.headers)
        stort_dic = requests.get(get_store_url, headers=self.headers)
        price = json.loads(price_dic.content.decode())[0]['p']
        store = json.loads(stort_dic.text)[0]['name']
        detail_url = item.xpath('./div/div[1]/a/@href')[0]
        detail_url = "https:{}".format(detail_url)
        detail_data = requests.get(detail_url, headers=self.headers)
        html = etree.HTML(detail_data.text)
        color = html.xpath('//*[@id="choose-attr-1"]/div/div/@data-value')
        version = html.xpath('//*[@id="choose-attr-2"]/div/div/@data-value')
        w_url = "https://c0.3.cn/stock?skuId={}&area=15_1243_3419_0&venderId={}&choseSuitSkuIds=&cat=9987,653,655".format(
            data_sku, venderid)
        d = requests.get(w_url, headers=self.headers)
        weight = json.loads(d.content.decode("GBK"))['stock'].get("weightValue")
        dic['id'] = data_sku
        dic['image'] = image
        dic['price'] = price
        dic['description'] = desc
        dic['store'] = store
        dic['url'] = detail_url
        dic['color'] = color
        dic['version'] = version
        dic['weight'] = weight
        data_list.append(dic)
        print('商品{}爬取完成'.format(data_sku))
        ft = Redis_filter()
        d = ft.get(dic)
        if d:
            print('__________该商品已经存在!________跳过该商品')
            continue
        ft.save(dic)
        self.save_to_mongo(dic)
# Task:
'''
Use xpath to extract the user avatar, user name, gender, age
and the joke text.
'''
import requests
from lxml.html import etree

url = "https://www.qiushibaike.com/"
r = requests.get(url)
html = etree.HTML(r.content.decode())
all_qiushi = html.xpath("//div[@id='content-left']/div")
for one in all_qiushi:
    userImg = one.xpath("./div[1]/a[1]/img/@src")
    if userImg:
        userImg = "http:" + userImg[0]
        username = one.xpath("./div[1]/a[2]/h2/text()")[0]
        userage = one.xpath("./div[1]/div/text()")[0]
        usersex = one.xpath("./div[1]/div/@class")[0]
        usersex = usersex[14:-4]
    else:
        userImg = "https://static.qiushibaike.com/images/thumb/anony.png?v=b61e7f5162d14b7c0d5f419cd6649c87"
        username = "******"
        userage = "0"
        usersex = 'man'
    userQiushi = ''.join(one.xpath("./a[1]/div/span[1]/text()")).replace("\n", '')
    userQiushiImg = one.xpath("./div[@class='thumb']/a/img/@src")
    if userQiushiImg:
            print(details_url)
            sql = f'INSERT into book_toscrape(book_title,img_url,price,star_rating,availability,details_url) VALUES ("{books_name[i]}", "{img_url}", "{prices[i][1:]}", "{star_rating[i][12:]}","{availability}","{details_url}") '
            cursor.execute(sql)
            conn.commit()
        cursor.close()


url = "http://books.toscrape.com/"

if __name__ == '__main__':
    pool = Pool(processes=4)
    results = []
    url_list = [url]
    for i in url_list:
        response = requests.get(i)
        response.encoding = "utf8"
        if response.status_code == 200:
            print(f"{i}连接成功...")
            html_text = etree.HTML(response.text)
            next_url = html_text.xpath("//li[@class='next']/a/@href")
            result = pool.apply_async(Book_Toscrape(html_text, i))
            if next_url != []:
                if i == "http://books.toscrape.com/":
                    url1 = url + next_url[0]
                else:
                    url1 = "http://books.toscrape.com/catalogue/" + next_url[0]
                url_list.append(url1)
            else:
                exit()
    pool.close()
    pool.join()
def fetch_chapter(chapter_name, url):
    """
    Extract one scripture chapter.
    :chapter_name: chapter name
    :url string: chapter url
    """
    print("经文提取开始:{0} {1}".format(chapter_name, url))
    if redis.exists(PROCESSED_URLS) and redis.sismember(PROCESSED_URLS, url):
        print("经文已处理,跳过")
        return
    res = http_request(url)
    html = etree.HTML(res)
    # extract the volume name
    chapter_names = "|".join(redis.hkeys(CHAPTERS))
    volume = " ".join(html.xpath("//table[@class='content']//p[1]//text()"))
    if volume.isspace() or len(volume) == 0:
        # when the page has no <p> tags, fall back to matching against the full text
        volume = " ".join(html.xpath("//table[@class='content']//text()"))
    try:
        volume = re.search(r"({0})".format(chapter_names), volume).group()
    except Exception as e:
        volume = "小部"
    volume_id = redis.hget(CHAPTERS, volume)
    # extract the body text
    content = html.xpath("//table[@class='content']//p//text()")
    content = "\n".join(list(filter(lambda x: x != "\u3000\u3000", content)))
    [order, title] = re.search(r"(\d+-?\d?)\s+(.*)$", chapter_name).groups()
    item = {
        "title": title,
        "order": order,
        "chapter_id": volume_id,
        "content": ""
    }
    contents = []
    contents.append(content)
    # extract the remaining pages
    LINK_PAGE_BASE = "http://www.chilin.edu.hk/edu/report_section_detail.asp"
    links = html.xpath("//td[@class='subtitle'][1]")[0].xpath(
        "./following-sibling::td[1]//a//@href")
    links = list(map(lambda x: LINK_PAGE_BASE + x, links))
    for link in links:
        res = http_request(link)
        html = etree.HTML(res)
        content = html.xpath("//table[@class='content']//p//text()")
        if len(content) == 0:
            content = html.xpath("//table[@class='content']//td/text()")
            content = list(
                filter(
                    lambda x: x == "\u3000\u3000" or not x.replace("|", "").isspace(),
                    content))[1:]
        content = "\n".join(
            list(map(lambda x: "\n" if x == "\u3000\u3000" else x, content)))
        contents.append(content)
    item["content"] = "\n".join(contents)
    # insert the chapter into the database
    row = db.select_one(
        "SELECT * FROM `book_article` WHERE `chapter_id`=%s and `title`=%s",
        (volume_id, item["title"]))
    if row is not None:
        print("{0} 经文已存在,无法插入!".format(item["title"]))
        redis.sadd(PROCESSED_URLS, url)
        return
    sql = """
        INSERT INTO `book_article` (
            `chapter_id`, `title`, `content`, `order`)
        VALUES (
            %s, %s, %s, %s)
    """
    data = (volume_id, item["title"], item["content"], item["order"])
    db.insert(sql, data)
    redis.sadd(PROCESSED_URLS, url)
    print("此经文已插入完毕")
def parse_laws_data(self, response):
    """
    # number
    classes_num = scrapy.Field()
    # Chinese title
    chinese_title = scrapy.Field()
    # database name
    base_name = scrapy.Field()
    # issuing department
    issu_department = scrapy.Field()
    # level of legal effect
    level = scrapy.Field()
    # validity status
    timeliness = scrapy.Field()
    # promulgation date
    issu_date = scrapy.Field()
    # effective date
    doit_date = scrapy.Field()
    # content category
    content_classes = scrapy.Field()
    :param response:
    :return:
    """
    item = LawsItem()
    item['classes_num'] = 0
    item['chinese_title'] = response.xpath(
        './/div[@class="left_con_top"]/div[@class="title"]/text()'
    ).extract_first('暂无').replace('\r', '').replace('\t', '').replace('\n', '')
    list_info = response.xpath(
        '//div[@class="left_con_top"]/ul//li/div[1]/text()').extract()
    print(list_info)
    if '库别名称:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">库别名称:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['base_name'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '颁布部门:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">颁布部门:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['issu_department'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '效力级别:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">效力级别:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['level'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '时效性:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">时效性:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['timeliness'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '颁布日期:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">颁布日期:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['issu_date'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '实施日期:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">实施日期:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['doit_date'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '内容分类:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">内容分类:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['content_classes'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    yield item
def getMovieById(request):
    print("############################")
    print("getMovieById")
    id = request.GET.get('id', '')
    print(id)
    m_list = Movie.objects.filter(m_id=id)
    if len(m_list) == 0:
        messages.error(request, '电影不存在!')
        return JsonResponse({'code': 0})
    res = []
    for i in m_list:
        x = Movie_()
        x.movieId = int(i.m_id)
        x.name = i.m_name
        x.actors = i.actor
        x.cover = i.imgurl
        x.directors = i.director
        x.genres = i.type + ' ' + i.actor
        x.officialSite = 'https://v.qq.com/'
        x.regions = i.area
        x.languages = i.language
        x.mins = i.length
        x.score = i.rate / 10.0
        x.tags = i.type
        x.year = ''
        try:
            if req.urlopen(i.imgurl).status != 200:
                i.imgurl = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1589887458475&di=38b6dbf53b6505b7a5cb3764c1857313&imgtype=0&src=http%3A%2F%2Fimg3.doubanio.com%2Fview%2Fgroup_topic%2Flarge%2Fpublic%2Fp108048762.jpg'
        except:
            i.imgurl = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1589887458475&di=38b6dbf53b6505b7a5cb3764c1857313&imgtype=0&src=http%3A%2F%2Fimg3.doubanio.com%2Fview%2Fgroup_topic%2Flarge%2Fpublic%2Fp108048762.jpg'
        try:
            response = requests.get('https://movie.douban.com/subject/' + i.m_id + '/',
                                    headers=headers)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, "html.parser")
            x.releaseDate = soup.find("span", attrs={
                "property": "v:initialReleaseDate"
            }).get_text()
            x.releaseDate = x.releaseDate.split('(')[0]
            x.storyline = soup.find("span", attrs={
                "property": "v:summary"
            }).get_text()
            x.votes = int(
                soup.find("span", attrs={
                    "property": "v:votes"
                }).get_text())
            x.cover = etree.HTML(response.text).xpath('//img/@src')[0]
        except:
            x.storyline = '亲爱的用户,很抱歉未获取到相应数据。'
            x.releaseDate = '0000-00-00'
            x.votes = 2032805
            x.cover = i.imgurl
        x.actorIds = ''
        x.directorIds = ''
        res.append(x)
    # m_list = serializers.serialize("json", res)
    m_list = json.dumps(res, default=lambda obj: obj.__dict__)
    return JsonResponse({'code': 1, 'm_list': m_list})
def Spider(self):
    jobl = []
    for page in range(self.page):
        params = {
            "start": 90 * page,
            "pageSize": 90,
            "workExperience": -1,
            "education": -1,
            "companyType": -1,
            "employmentType": -1,
            "jobWelfareTag": -1,
            "kw": self.keyword,
            "kt": 3,
            "cityId": self.city,
            "salary": '0, 0'
        }
        req = requests.get(url=self.base_url, params=params, headers=get_header())
        cookie = req.cookies
        print(cookie)
        data = req.json()['data']['results']
        if len(data) != 0:
            for job in data:
                # print(job)
                jobd = {}
                jobd['ID'] = job.get('number')
                jobd['工作名称'] = job.get('jobName')
                jobd['招聘详细链接'] = job.get('positionURL')
                company = job.get('company')
                jobd['公司名称'] = company.get('name')
                jobd['公司ID'] = company.get('number')
                jobd['公司性质'] = company.get('type').get('name')
                jobd['公司规模'] = company.get('size').get('name')
                jobd['公司招聘主页'] = company.get('url')
                jobd['公司地点'] = job.get('city').get('display')
                jobd['薪资'] = job.get('salary')
                jobd['学历要求'] = job.get('eduLevel').get('name')
                try:
                    jobd['工作经历'] = job.get('workingExp').get('name')
                except:
                    jobd['工作经历'] = '经验不限'
                jobd['职位类型'] = job.get('emplType')
                jobd['公司福利'] = '、'.join(job.get('welfare')) or '无'
                jobd['工作发布标签'] = job.get('timeState')
                jobd['更新时间'] = job.get('updateDate')
                header = get_header()
                header['referer'] = job.get('positionURL')
                header['upgrade-insecure-requests'] = '1'
                header['cookie'] = config.ZHILIAN_COOKIE
                req1 = requests.get(job.get('positionURL'), headers=header)
                req1.encoding = 'utf-8'
                html = etree.HTML(req1.text)
                detail = ''.join(
                    html.xpath('//*[@class="describtion__detail-content"]//*/text()'))
                if not detail:
                    detail = ''.join(
                        html.xpath('//*[@class="describtion__detail-content"]/text()'))
                print(job.get('positionURL'))
                print(detail)
                jobd['职位描述'] = detail.strip()
                jobl.append(jobd)
        else:
            break
    return jobl
def prase2(txt):
    html0 = etree.HTML(txt)
    list1 = html0.xpath('//*[@id="content"]/p/text()')
    return list1
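# A minimal sketch of how prase2 might be driven, assuming the page is fetched
# with requests; the URL is a placeholder, not taken from the original script.
import requests

resp = requests.get('http://example.com/chapter/1')
resp.encoding = 'utf-8'
for paragraph in prase2(resp.text):  # each <p> text node under div#content
    print(paragraph)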