def parse_items(self, response):
    html = etree.HTML(response["response"])
    city = response["city"]
    item_data = html.xpath('//div[@_soj="xqlb"]')
    next_item = html.xpath('//div[@class="multi-page"]/a')
    if next_item:
        # Keep the Chinese literal "下一页" ("next page"): it must match the
        # pager text on the site itself.
        if "下一页" in next_item[-1].xpath('text()')[0]:
            next_url = next_item[-1].xpath('@href')
            if next_url:
                log.log("next page link " + next_url[0], "debug")
                next_task = Sin_Req(next_url[0], self.parse_items, meta={"city": city})
                task_queue.add_task(next_task)
    if item_data:
        for i in item_data:
            item = dict()
            item["city"] = city
            item["title"] = i.xpath('a/@title')
            item["link"] = i.xpath('a/@href')
            item["address"] = i.xpath('div[@class="li-info"]/address/text()')
            item["year"] = i.xpath('div[@class="li-info"]/p[@class="date"]/text()')
            item["price"] = i.xpath('div[@class="li-side"]/p[1]/strong/text()')
            if item["link"]:
                # The community id is the trailing segment after "view/" in the listing URL.
                commid = re.findall(r'view/(.*?)$', item["link"][0])
                if commid:
                    comm_link = self.base_comm_link + commid[0]
                    task_queue.add_task(Sin_Req(comm_link, self.parse_comm_trend,
                                                meta={"commid": commid}))
            log.log(str(item), "info")
            mon.insert("ajk_xiaoqu", item)
def parse_arti_data(self, response):
    """
    Parse the published-article data and build the comment requests for each article.
    :param response:
    :return:
    """
    data = json.loads(response["response"])
    add_headers = response["add_headers"]
    if 'paging' in data:
        if not data["paging"]["is_end"]:
            for i in data["data"]:
                log.log(str(i), "info")
                # mon.insert("data", data)
                if i["target"] and i["target"]["id"]:
                    article_id = i["target"]["id"]
                    page = 0
                    # The comment endpoint takes an offset, 20 comments per page.
                    comm_task = Sin_Req(self.comment_base_url.format(id=article_id, page=20 * page),
                                        callback=self.parse_comment,
                                        meta={"add_headers": add_headers,
                                              "page": page,
                                              "article_id": article_id})
                    comm_task.headers.update(add_headers)
                    task_queue.add_task(comm_task)
            next_link = data["paging"]["next"]
            log.log("next article-list link " + next_link, "debug")
            s = Sin_Req(next_link, callback=self.parse_arti_data, meta={"add_headers": add_headers})
            s.headers.update(add_headers)
            task_queue.add_task(s)
        else:
            # Final page: log the remaining items, then persist the payload once
            # (the original inserted the same whole payload once per item).
            for i in data["data"]:
                log.log(str(i), "info")
            mon.insert("data", data)
def parse_item(self,response): html = etree.HTML(response["response"]) items = html.xpath('//ul[@class="row-fluid list-row js-car-list"]/li') log.log(str(items), "debug") if items: for i in items: data = dict() link = i.xpath('a/@href') car_id = i.xpath('a/@data-car-id') title = i.xpath('div[@class="schedule btn-base btn-wireframe"]/@data-title') city = i.xpath('a/div[@class="img-backgound"]/div[@class="position-bg"]/span/text()') year_age = i.xpath('a/div[@class="mileage"]/span[1]/text()') price = i.xpath('a/div[@class="tags-box"]/div[@class="price"]/text()') data["car_id"] = car_id data["title"] = title data["link"] = link data["city"] = city data["year_age"] = year_age data["price"] = price log.log(str(data),"info") mon.insert("data",data) if link: detail_link = urljoin(self.base_url,link[0]) task_queue.add_task(Sin_Req(detail_link,callback=self.parse_detail,meta={"data_id":car_id})) log.log("detail_link "+detail_link,"debug") base_link = response["base_link"] next_page = response["now_page"] + 1 next_link = base_link+"p"+str(next_page)+r"/" log.log(next_link,"debug") task_queue.add_task(Sin_Req(next_link,callback=self.parse_item,meta={"base_link":base_link,"now_page":next_page}))
def parse_follow_lst(self, response):
    offset = response["offset"]
    try:
        data = json.loads(response["response"])
        log.log(str(data), "info")
        mon.insert("follower_lst", data)
        if data["data"]:
            for j in data["data"]:
                user_id = j["id"]
                log.log(str(j), "info")
                log.log(str(user_id), "debug")
                detail_task = Sin_Req(self.base_user_url.format(u_id=user_id),
                                      callback=self.parse_user_detail)
                detail_task.headers.update(self.add_headers)
                task_queue.add_task(detail_task)
        if data["paging"] and not data["paging"]["is_end"]:
            # The follower API pages in steps of 10: the next offset is 10 * (offset + 1).
            next_task = Sin_Req(self.base_url.format(10 * (offset + 1)),
                                self.parse_follow_lst,
                                meta={"offset": offset + 1})
            next_task.headers.update(self.add_headers)
            task_queue.add_task(next_task)
    except Exception as e:
        # The original bare except swallowed the actual error; log it with the offset.
        log.log("offset {} failed: {}".format(offset, e), "error")
def parse_items(self, response):
    """
    Crawl the forum-thread list.
    :param response:
    :return:
    """
    html = etree.HTML(response["response"])
    items = html.xpath('//dl[@class="list_dl"]')
    next_items = html.xpath('//a[@class="page_down"]/@href')
    log.log(str(items), "debug")
    car_name = response["car"]
    # Parse the threads first: the original only did this when a next-page
    # link existed, which silently dropped the final page of every board.
    if items:
        for i in items:
            data = {}
            title = i.xpath('dt/p[@class="thenomal"]/a/text()')
            link = i.xpath('dt/p[@class="thenomal"]/a/@href')
            post_time = i.xpath('dd[@class="w98"]/span[@class="tdate"]/text()')
            comment_nums = i.xpath('dd[@class="cli_dd"]/span[@class="fontblue"]/text()')
            views_nums = i.xpath('dd[@class="cli_dd"]/span[@class="tcount"]/text()')
            data["car"] = car_name
            data["title"] = title
            data["link"] = link
            data["post_time"] = post_time
            data["comment_nums"] = comment_nums
            data["views_nums"] = views_nums
            log.log(str(data), "info")
            mon.insert("data", data)
    if next_items:
        next_link = "http://www.xcar.com.cn/bbs/" + next_items[0]
        log.log("next page link " + next_link, "debug")
        task_queue.add_task(Sin_Req(next_link, self.parse_items, meta={"car": car_name}))
def parse_brand(self, response):
    """
    Parse the basic information for each car brand.
    :param response:
    :return:
    """
    try:
        data = json.loads(response["response"])
        for i in data["data"]["option"]:
            log.log(str(i), "info")
            mon.insert("car_info", i)
            car_name = i["text"]
            page = 1
            task_link = self.base_items_link.format(series=i["tag_url"], page=page)
            item_task = Sin_Req(task_link, self.parse_item,
                                meta={"cur_link": task_link,
                                      "car_name": car_name,
                                      "page": page,
                                      "series": i["tag_url"]})
            task_queue.add_task(item_task)
    except Exception as e:
        log.log(str(e), "error")
def parse_zones(self, response):
    city_name = response["city_name"]
    base_link = response["city_link"]
    html = etree.HTML(response["response"])
    # Drop the first <li>; only the per-zone entries are wanted.
    items = html.xpath('//ul[@class="search-area-detail clearfix"]/li')[1:]
    log.log("ZONE " + str(items), "debug")
    for i in items:
        zone_ = i.xpath('a/text()')
        link_ = i.xpath('a/@href')
        if zone_:
            data = {}
            data["zone_name"] = zone_[0]
            data["zone_link"] = link_[0]
            log.log(str(data), "info")
            zone_url = "https:" + urljoin(base_link, link_[0])
            task_queue.add_task(Sin_Req(zone_url, self.parse_details,
                                        meta={"city_name": city_name,
                                              "zone_name": zone_[0],
                                              "zone_base_url": zone_url}))
def parse_items(self, response):
    """
    Parse the listing-page data.
    :param response: the HTML of a successfully fetched page
    :return: the next-page link is rewrapped as a Request and queued;
             parsed records are persisted to MongoDB
    """
    html = etree.HTML(response["response"])
    city = response["city"]
    # Note: the trailing space in "zu-itemmod " matches the site's actual class attribute.
    item_data = html.xpath('//div[@class="zu-itemmod "]')
    next_item = html.xpath('//div[@class="multi-page"]/a')
    if next_item:
        if "下一页" in next_item[-1].xpath('text()')[0]:
            next_url = next_item[-1].xpath('@href')
            if next_url:
                log.log("next page link " + next_url[0], "debug")
                next_task = Sin_Req(next_url[0], self.parse_items, meta={"city": city})
                task_queue.add_task(next_task)
    if item_data:
        for i in item_data:
            item = dict()
            item["city"] = city
            item["title"] = i.xpath('a/@title')
            item["link"] = i.xpath('a/@href')
            item["size_style"] = i.xpath('div[@class="zu-info"]/p[@class="details-item tag"]/span[1]/text()')
            item["price"] = i.xpath('div[@class="zu-side"]/p/strong/text()')
            item["name"] = i.xpath('div[@class="zu-info"]/address/a/text()')
            item["address"] = i.xpath('div[@class="zu-info"]/address/text()')
            log.log(str(item), "info")
            mon.insert("ajk_zufang", item)
def parse_details(self, response):
    city_name = response["city_name"]
    zone_name = response["zone_name"]
    html = etree.HTML(response["response"])
    items = html.xpath('//div[@class="house-detail"]/ul/li')
    next_items = html.xpath('//div[@class="pages-box clearfix"]/div[2]/a[@class="turnpage_next"]')
    if next_items:
        if "下一页" in next_items[0].xpath('span/text()')[0]:
            next_page = next_items[0].xpath('@href')
            log.log(str(next_page), "debug")
            if next_page:
                zone_base_url = response["zone_base_url"]
                next_link = urljoin(zone_base_url, next_page[0])
                log.log(next_link, "info")
                task_queue.add_task(Sin_Req(next_link, self.parse_details,
                                            meta={"city_name": city_name,
                                                  "zone_name": zone_name,
                                                  "zone_base_url": zone_base_url}))
    if items:
        for i in items:
            data = {}
            title = i.xpath('div[1]/p[@class="house-title"]/a/@title')
            link = i.xpath('a/@href')
            base_info = i.xpath('div[1]/p[@class="house-about clearfix"]/span/text()')
            address = i.xpath('div[1]/p[@class="house-address clearfix"]/span[@class="whole-line"]/a/text()')
            price = i.xpath('div[@class="show-price"]/span[@class="sale-price"]/text()')
            uni_price = i.xpath('div[@class="show-price"]/p/text()')
            # Attach the city/zone context threaded through meta; the original
            # parsed these values but never stored them with the record.
            data["city_name"] = city_name
            data["zone_name"] = zone_name
            data["title"] = title
            data["link"] = link
            data["base_info"] = base_info
            data["address"] = address
            data["price"] = price
            data["uni_price"] = uni_price
            log.log(str(data), "info")
            mon.insert("data", data)
def start_req(self):
    offset = 0
    task = Sin_Req(self.base_url.format(offset * 10),
                   callback=self.parse_follow_lst,
                   meta={"offset": offset})
    task.headers.update(self.add_headers)
    task_queue.add_task(task)
def parse_item(self, response):
    """
    Crawl the used-car listing pages.
    :param response:
    :return:
    """
    page = response["page"]
    cur_link = response["cur_link"]
    series = response["series"]
    car_name = response["car_name"]
    try:
        req_data = json.loads(response["response"])
        if req_data["data"]["thisCity"]:
            data = dict()
            data["car_name"] = car_name
            data["item_info"] = req_data["data"]["thisCity"]
            # old_task presumably marks cur_link as consumed (framework bookkeeping).
            task_queue.old_task(cur_link)
            log.log(str(data), "info")
            mon.insert("item_data", data)
            next_link = self.base_items_link.format(series=series, page=page + 1)
            log.log("next page link " + next_link, "debug")
            # The original built next_task but never enqueued it, and passed the
            # stale page number in meta, so pagination stopped after one page.
            next_task = Sin_Req(next_link, self.parse_item,
                                meta={"cur_link": next_link,
                                      "car_name": car_name,
                                      "page": page + 1,
                                      "series": series})
            task_queue.add_task(next_task)
    except Exception as e:
        log.log(str(e), "error")
def start_req(self):
    for city, link in self.task.items():
        task_queue.add_task(Sin_Req(link + r'sale/', self.parse_items, meta={"city": city}))
def parse_items(self, response):
    html = etree.HTML(response["response"])
    city_name = response["city_name"]
    items = html.xpath('//li[@class="con caritem conHeight"]')
    next_item = html.xpath('//div[@class="con-page search_page_link"]/a')
    log.log("next-page nodes " + str(next_item), "debug")
    for j in items:
        data = dict()
        title = j.xpath('@data-title')
        car_id = j.xpath('@data-carid')
        price = j.xpath('@data-price')
        link = j.xpath('div[@class="across"]/a/@href')
        city_id = j.xpath('div[@class="across"]/a/@data-cityid')
        brand_id = j.xpath('div[@class="across"]/a/@data-brandid')
        series_id = j.xpath('div[@class="across"]/a/@data-seriesid')
        year_age = j.xpath('div[@class="across"]/div[@class="pad"]/span[1]/text()')
        data["title"] = title
        data["car_id"] = car_id
        data["price"] = price
        data["link"] = link
        data["city_id"] = city_id
        data["city_name"] = city_name
        data["brand_id"] = brand_id
        data["series_id"] = series_id
        # year_age should hold two text nodes (model year, age); guard the
        # indexing so one malformed card doesn't abort the whole page.
        data["year"] = year_age[0] if len(year_age) > 0 else None
        data["age"] = year_age[1] if len(year_age) > 1 else None
        log.log(str(data), "info")
        mon.insert("item_data", data)
        if link and car_id:
            report_task = Sin_Req(self.base_report + car_id[0],
                                  self.parse_report,
                                  meta={"car_id": car_id})
            # Point the Referer at the listing page before requesting the report.
            report_task.headers["Referer"] = urljoin(self.base_url, link[0])
            task_queue.add_task(report_task)
    if next_item:
        next_page = next_item[-1].xpath('text()')
        if next_page and "下一页" in next_page[0]:
            next_link = next_item[-1].xpath('@href')[0]
            log.log("next page " + next_link, "debug")
            task_queue.add_task(Sin_Req(urljoin(self.base_url, next_link),
                                        callback=self.parse_items,
                                        meta={"city_name": city_name}))
def start_req(self): """ 依据查询关键字形成初始请求 :return: """ for k in KEYS: start_task = Sin_Req(self.base_url + quote(k), self.parse_lst) task_queue.add_task(start_task)
def start_req(self): """ 任务初始化 :return: 把初始种子url封装成Request对象放进任务队列 """ for k in self.task.keys(): city = k link = self.task[city].replace(r'anjuke',r'zu.anjuke') task_queue.add_task(Sin_Req(link,self.parse_items,meta={"city":city}))
def start_req(self):
    for i in self.start_urls:
        start_page = 1
        task_queue.add_task(Sin_Req(i + "p" + str(start_page) + r"/",
                                    callback=self.parse_item,
                                    meta={"now_page": start_page, "base_link": i}))
def start_req(self): """ 任务的初始化 :return:把初始任务url封装成Request放入任务队列 """ for k in self.task.keys(): city = k link = self.task[city] task_queue.add_task( Sin_Req(link + r'sale/', self.parse_items, meta={"city": city}))
def start_req(self): """ 初始化任务队列 :return: """ for k in self.cars.keys(): car_name = k car_base_link = self.cars[k] task_queue.add_task( Sin_Req(car_base_link, self.parse_items, meta={"car": car_name}))
def article_start_req(self):
    """
    Build the request for the dynamically loaded article-list data.
    :return:
    """
    add_headers = {
        'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
        'referer': 'https://www.zhihu.com/org/shu-ju-bing-shan/activities'
    }
    s = Sin_Req(self.article_start_url, callback=self.parse_arti_data,
                meta={"add_headers": add_headers})
    s.headers.update(add_headers)
    task_queue.add_task(s)
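# Every Zhihu request in these spiders repeats the same two steps: stash
# add_headers in meta so the callback can forward the credentials, and apply
# them to the outgoing request itself. A sketch of how that pattern could be
# folded into one place -- make_authed_task is a hypothetical helper, not a
# function that exists in the project:
def make_authed_task(url, callback, add_headers, **extra_meta):
    meta = {"add_headers": add_headers}
    meta.update(extra_meta)
    task = Sin_Req(url, callback=callback, meta=meta)
    task.headers.update(add_headers)  # authorize this fetch as well
    return task

# With it, article_start_req reduces to
#     task_queue.add_task(make_authed_task(self.article_start_url,
#                                          self.parse_arti_data, add_headers))
# and parse_arti_data / parse_comment could build their follow-ups the same way.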
def parse_lst(self, response):
    """
    Parse the detail-page URLs out of the list page, and build the next-page request.
    :param response:
    :return:
    """
    html = etree.HTML(response["response"])
    items = html.xpath('//div[@id="infolist"]/dl')
    for i in items:
        detail_url = i.xpath('dt[@class="w325"]/a/@href')
        if not detail_url:  # guard: the original indexed [0] unconditionally
            continue
        log.log("detail page " + detail_url[0], "debug")
        # The href is protocol-relative ("//..."): prepend the scheme, then
        # normalize the path ("single" -> "singles") and the query separator.
        cleaned_detail_url = "https://" + detail_url[0][2:]
        detail_task = Sin_Req(cleaned_detail_url.replace('single', "singles")
                              .replace(r'/?psid', '?psid'),
                              self.parse_detail)
        task_queue.add_task(detail_task)
    next_page = html.xpath('//a[@class="next"]/@href')
    if next_page:
        log.log("next page link " + next_page[0], "debug")
        next_task = Sin_Req(next_page[0], self.parse_lst)
        task_queue.add_task(next_task)
def start_req(self): """ 对每个车型的首页列表页进行初始化请求 :return: """ r = requests.get("https://www.xin.com/apis/Ajax_common/get_home_city/?cityid=201") data = json.loads(r.text) for i in data["data"]["city_all"]: city = data["data"]["city_all"][i]["ename"] city_name =data["data"]["city_all"][i]["cityname"] for i in self.cars: start_link = urljoin(self.base_url,city+r'/'+i) log.log(start_link,"debug") task_queue.add_task(Sin_Req(start_link,callback=self.parse_items,meta = {"city_name":city_name}))
def parse_start(self, response):
    html = etree.HTML(response["response"])
    items = html.xpath('//ul[@class="cities-opts clearfix"]/li')
    log.log(str(items), "debug")
    for i in items:
        a_items = i.xpath('p/a[@class="highlight"]')
        log.log(str(a_items), "debug")
        if a_items:
            for j in a_items:
                data = dict()
                name = j.xpath('text()')[0]
                link = j.xpath('@href')[0]
                data["city_name"] = name
                data["city_link"] = link
                mon.insert("city_data", data)
                log.log(str(data), "info")
                # The hrefs are protocol-relative, so prepend the scheme.
                task_queue.add_task(Sin_Req(url="https:" + link,
                                            callback=self.parse_zones,
                                            meta={"city_link": link, "city_name": name}))
def parse_items(self, response):
    """
    Parse the records out of a listing page.
    :param response: the response for a listing page
    :return: any next-page link is rewrapped and queued; parsed records
             are persisted to MongoDB
    """
    html = etree.HTML(response["response"])
    city = response["city"]
    item_data = html.xpath('//ul[@id="houselist-mod-new"]/li')
    next_item = html.xpath('//div[@class="multi-page"]/a')
    if next_item:
        if "下一页" in next_item[-1].xpath('text()')[0]:
            next_url = next_item[-1].xpath('@href')
            if next_url:
                log.log("next page link " + next_url[0], "debug")
                next_task = Sin_Req(next_url[0], self.parse_items, meta={"city": city})
                task_queue.add_task(next_task)
    if item_data:
        for i in item_data:
            item = dict()
            item["city"] = city
            item["title"] = i.xpath('div[@class="house-details"]/div[1]/a/@title')
            item["link"] = i.xpath('div[@class="house-details"]/div[1]/a/@href')
            item["room_style"] = i.xpath('div[@class="house-details"]/div[2]/span[1]/text()')
            item["size"] = i.xpath('div[@class="house-details"]/div[2]/span[2]/text()')
            item["price"] = i.xpath('div[@class="pro-price"]/span[@class="price-det"]/strong/text()')
            item["uni_price"] = i.xpath('div[@class="pro-price"]/span[@class="unit-price"]/text()')
            # The build year is the second-to-last span in the details row.
            item["year"] = i.xpath('div[@class="house-details"]/div[2]/span')[-2].xpath('text()')
            item["name_location"] = i.xpath('div[@class="house-details"]/div[3]/span/@title')
            log.log(str(item), "info")
            mon.insert("ajk_ershou", item)
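# The pager idiom above -- take the last <a> in the "multi-page" div and test
# its text for the "下一页" ("next page") label -- is duplicated across the
# xiaoqu, zufang, and ershou spiders. A hypothetical extraction of that idiom
# (next_page_url is not a project function, just a sketch of the shared logic):
def next_page_url(html):
    # Return the pager's next-page href, or None once the last page is reached.
    pager = html.xpath('//div[@class="multi-page"]/a')
    if pager and "下一页" in pager[-1].xpath('text()')[0]:
        href = pager[-1].xpath('@href')
        if href:
            return href[0]
    return None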
def parse_comment(self, response):
    """
    Parse the comment content of one article.
    :param response:
    :return:
    """
    page = response["page"]
    article_id = response["article_id"]
    add_headers = response["add_headers"]
    try:
        data = json.loads(response["response"])
        if "paging" in data:
            if not data["paging"]["is_end"]:
                log.log(str(data), "info")
                mon.insert("comment_data", data)
                # The endpoint takes an offset, 20 comments per page.
                next_comment_task = Sin_Req(
                    self.comment_base_url.format(id=article_id, page=20 * (page + 1)),
                    callback=self.parse_comment,
                    meta={"add_headers": add_headers,
                          "page": page + 1,
                          "article_id": article_id})
                next_comment_task.headers.update(add_headers)
                task_queue.add_task(next_comment_task)
            else:
                log.log("Last comment data " + str(data), "info")
                mon.insert("comment_data", data)
    except Exception as e:
        log.log(str(e), "error")
def start_req(self):
    # 22 pages, 12 items per page; the URL template takes an item offset.
    for i in range(22):
        task_link = self.base_url.format(i * 12)
        task_queue.add_task(Sin_Req(task_link, self.parse_items))
def start_req(self):
    for city, link in self.task.items():
        # Rewrite the city homepage to its rental subdomain (anjuke -> zu.anjuke).
        link = link.replace(r'anjuke', r'zu.anjuke')
        task_queue.add_task(Sin_Req(link, self.parse_items, meta={"city": city}))
# Smoke test: fetch one Lianjia page through the queue and print its <title>.
from lxml import etree

from m_queue import TaskQueue
from req import Sin_Req

tt = TaskQueue()
url = "https://sz.lianjia.com/ershoufang/pg2/"


def A(response):
    html = etree.HTML(response["response"])
    title = html.xpath('//title/text()')
    print(title)


rr = Sin_Req(url=url, callback=A)
tt.add_task(rr)
a = tt.pop_task()
a.get()
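# The spiders above lean on four module-level helpers that are never shown:
# Sin_Req, task_queue (a TaskQueue), log, and mon. Below is a minimal sketch
# of what the first two interfaces would have to look like to satisfy the
# calls the spiders make. It is an assumption for readability, not the
# project's actual implementation -- the real Sin_Req.get presumably also
# handles retries, encodings, and the task_queue.old_task bookkeeping seen
# in parse_item.
import requests


class Sin_Req:
    """One crawl task: a URL, a parse callback, and meta carried to the callback."""

    def __init__(self, url, callback=None, meta=None):
        self.url = url
        self.callback = callback
        self.meta = meta or {}
        self.headers = {}  # spiders mutate this dict before enqueueing

    def get(self):
        # Fetch the page, then hand the callback a dict that merges the body
        # with meta -- matching the response["response"] / response["city"]
        # access pattern used throughout the spiders.
        r = requests.get(self.url, headers=self.headers)
        payload = {"response": r.text}
        payload.update(self.meta)
        if self.callback:
            self.callback(payload)


class TaskQueue:
    """FIFO queue of pending Sin_Req tasks."""

    def __init__(self):
        self._tasks = []

    def add_task(self, task):
        self._tasks.append(task)

    def pop_task(self):
        return self._tasks.pop(0)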
def start_req(self):
    # Fetch the seed page synchronously; parse_start queues all follow-up tasks.
    s = Sin_Req(self.start_urls, self.parse_start)
    s.get()