コード例 #1
0
    def parse_items(self, response):
        """Parse a community listing page.

        Enqueues the next-page request when the pager's last anchor reads
        "下一页", queues a community-trend request for each listing that has
        a link, and persists every listing to the ``ajk_xiaoqu`` collection.
        """
        doc = etree.HTML(response["response"])
        city = response["city"]
        pager = doc.xpath('//div[@class="multi-page"]/a')
        if pager and "下一页" in pager[-1].xpath('text()')[0]:
            href = pager[-1].xpath('@href')
            if href:
                log.log("下一页链接   " + href[0], "debug")
                task_queue.add_task(
                    Sin_Req(href[0], self.parse_items, meta={"city": city}))
        for node in doc.xpath('//div[@_soj="xqlb"]'):
            item = {
                "city": city,
                "title": node.xpath('a/@title'),
                "link": node.xpath('a/@href'),
                "address": node.xpath('div[@class="li-info"]/address/text()'),
                "year": node.xpath('div[@class="li-info"]/p[@class="date"]/text()'),
                "price": node.xpath('div[@class="li-side"]/p[1]/strong/text()'),
            }
            if item["link"]:
                # Community id is the tail of the listing URL after "view/".
                commid = re.findall(r'view/(.*?)$', item["link"][0])
                if commid:
                    comm_link = self.base_comm_link + commid[0]
                    task_queue.add_task(
                        Sin_Req(comm_link, self.parse_comm_trend,
                                meta={"commid": commid}))
            log.log(str(item), "info")
            mon.insert("ajk_xiaoqu", item)
コード例 #2
0
 def parse_arti_data(self, response):
     """Parse one page of the published-article feed.

     While ``paging.is_end`` is false: log every article, queue a first
     comment-page request per article id, then queue the next feed page.
     On the final page the raw payload is stored instead.

     :param response: dict with the raw JSON body and ``add_headers``
     :return: None
     """
     data = json.loads(response["response"])
     add_headers = response["add_headers"]
     if 'paging' not in data:
         return
     if data["paging"]["is_end"]:
         # Final page: persist the whole payload.
         for entry in data["data"]:
             log.log(str(entry), "info")
             mon.insert("data", data)
         return
     print(data)
     for entry in data["data"]:
         log.log(str(entry), "info")
         target = entry["target"]
         if target and target["id"]:
             article_id = target["id"]
             page = 0
             comm_task = Sin_Req(self.comment_base_url.format(id=article_id, page=20 * page), callback=self.parse_comment, meta={"add_headers": add_headers, "page": page, "article_id": article_id})
             comm_task.headers.update(add_headers)
             task_queue.add_task(comm_task)
     next_link = data["paging"]["next"]
     log.log("下一个文章链接   " + next_link, "debug")
     follow_up = Sin_Req(next_link, callback=self.parse_arti_data, meta={"add_headers": add_headers})
     follow_up.headers.update(add_headers)
     print(follow_up.headers)
     task_queue.add_task(follow_up)
コード例 #3
0
 def parse_item(self, response):
     """Parse a used-car list page, persist each car, and page forward."""
     doc = etree.HTML(response["response"])
     cars = doc.xpath('//ul[@class="row-fluid list-row js-car-list"]/li')
     log.log(str(cars), "debug")
     if not cars:
         return
     for car in cars:
         link = car.xpath('a/@href')
         car_id = car.xpath('a/@data-car-id')
         record = {
             "car_id": car_id,
             "title": car.xpath('div[@class="schedule btn-base btn-wireframe"]/@data-title'),
             "link": link,
             "city": car.xpath('a/div[@class="img-backgound"]/div[@class="position-bg"]/span/text()'),
             "year_age": car.xpath('a/div[@class="mileage"]/span[1]/text()'),
             "price": car.xpath('a/div[@class="tags-box"]/div[@class="price"]/text()'),
         }
         log.log(str(record), "info")
         mon.insert("data", record)
         if link:
             detail_link = urljoin(self.base_url, link[0])
             task_queue.add_task(Sin_Req(detail_link, callback=self.parse_detail, meta={"data_id": car_id}))
             log.log("detail_link   " + detail_link, "debug")
     # Next page: same base URL with an incremented "pN/" suffix.
     base_link = response["base_link"]
     next_page = response["now_page"] + 1
     next_link = base_link + "p" + str(next_page) + r"/"
     log.log(next_link, "debug")
     task_queue.add_task(Sin_Req(next_link, callback=self.parse_item, meta={"base_link": base_link, "now_page": next_page}))
コード例 #4
0
 def parse_follow_lst(self, response):
     """Parse one page of a follower list.

     Stores the raw payload, queues a user-detail request per follower,
     and pages forward while ``paging.is_end`` is false.

     :param response: dict with the raw JSON body and ``offset``
         (zero-based page index)
     :return: None

     Fixes: the bare ``except:`` (which also swallowed KeyboardInterrupt
     and SystemExit and hid the actual cause) is narrowed to
     ``Exception`` and the error is logged; leftover debug ``print``
     calls are removed.
     """
     offset = response["offset"]
     try:
         data = json.loads(response["response"])
         log.log(str(data), "info")
         mon.insert("follower_lst", data)
         if data["data"]:
             for follower in data["data"]:
                 user_id = follower["id"]
                 log.log(str(follower), "info")
                 log.log(str(user_id), "debug")
                 detal_task = Sin_Req(
                     self.base_user_url.format(u_id=user_id),
                     callback=self.parse_user_detail)
                 detal_task.headers.update(self.add_headers)
                 task_queue.add_task(detal_task)
         if data["paging"] and not data["paging"]["is_end"]:
             next_task = Sin_Req(self.base_url.format(10 * (offset + 1)),
                                 self.parse_follow_lst,
                                 meta={"offset": offset + 1})
             next_task.headers.update(self.add_headers)
             task_queue.add_task(next_task)
     except Exception as e:
         # Log the cause as well as the failing offset.
         log.log(str(e), "error")
         log.log(str(offset), "error")
コード例 #5
0
 def parse_items(self, response):
     """
     Parse a forum-thread list page.

     :param response: dict with the raw HTML and the car model name
     :return: None; rows go to MongoDB, the next page is re-queued

     Bug fix: the original only parsed items inside ``if next_items:``,
     so the FINAL page of every forum (which has no "page_down" link)
     was silently dropped.  Items are now persisted unconditionally and
     the next page is queued only when the pager offers one.
     """
     html = etree.HTML(response["response"])
     items = html.xpath('//dl[@class="list_dl"]')
     next_items = html.xpath('//a[@class="page_down"]/@href')
     log.log(str(items), "debug")
     car_name = response["car"]
     for i in items:
         data = {
             "car": car_name,
             "title": i.xpath('dt/p[@class="thenomal"]/a/text()'),
             "link": i.xpath('dt/p[@class="thenomal"]/a/@href'),
             "post_time": i.xpath(
                 'dd[@class="w98"]/span[@class="tdate"]/text()'),
             "comment_nums": i.xpath(
                 'dd[@class="cli_dd"]/span[@class="fontblue"]/text()'),
             "views_nums": i.xpath(
                 'dd[@class="cli_dd"]/span[@class="tcount"]/text()'),
         }
         log.log(str(data), "info")
         mon.insert("data", data)
     if next_items:
         next_link = "http://www.xcar.com.cn/bbs/" + next_items[0]
         log.log("下一页连接  " + next_link, "debug")
         task_queue.add_task(
             Sin_Req(next_link, self.parse_items, meta={"car": car_name}))
コード例 #6
0
 def parse_brand(self, response):
     """
     Parse the car-brand option list (JSON).

     Persists each brand and queues page 1 of its item list.

     :param response: dict carrying the raw JSON body
     :return: None
     """
     try:
         payload = json.loads(response["response"])
         for brand in payload["data"]["option"]:
             log.log(str(brand), "info")
             mon.insert("car_info", brand)
             first_page = 1
             task_link = self.base_items_link.format(series=brand["tag_url"],
                                                     page=first_page)
             task_queue.add_task(
                 Sin_Req(task_link,
                         self.parse_item,
                         meta={
                             "cur_link": task_link,
                             "car_name": brand["text"],
                             "page": first_page,
                             "series": brand["tag_url"]
                         }))
     except Exception as e:
         log.log(str(e), "error")
コード例 #7
0
 def parse_zones(self, response):
     """
     Parse the district (zone) filter list of a city page.

     Logs every zone and queues a detail-list request for it.

     :param response: dict with the raw HTML, ``city_name`` and
         ``city_link``
     :return: None

     Fix: both the zone name AND its href are now required before
     indexing — previously only ``zone_`` was checked, so an anchor
     with text but no ``@href`` raised IndexError and aborted the page.
     """
     city_name = response["city_name"]
     base_link = response["city_link"]
     html = etree.HTML(response["response"])
     # Skip the first <li> as the original did (presumably an
     # "all zones" header entry — confirm against the live page).
     items = html.xpath('//ul[@class="search-area-detail clearfix"]/li')[1:]
     log.log("ZONE   " + str(items), "debug")
     for i in items:
         zone_ = i.xpath('a/text()')
         link_ = i.xpath('a/@href')
         if zone_ and link_:
             data = {"zone_name": zone_[0], "zone_link": link_[0]}
             log.log(str(data), "info")
             zone_url = "https:" + urljoin(base_link, link_[0])
             task_queue.add_task(
                 Sin_Req(zone_url,
                         self.parse_details,
                         meta={
                             "city_name": city_name,
                             "zone_name": zone_[0],
                             "zone_base_url": zone_url
                         }))
コード例 #8
0
 def parse_items(self, response):
     """
     Parse a rental-listing page.

     :param response: dict with the raw HTML and the originating city
     :return: None; the next page is re-queued, each listing is stored
         in the ``ajk_zufang`` collection
     """
     doc = etree.HTML(response["response"])
     city = response["city"]
     pager = doc.xpath('//div[@class="multi-page"]/a')
     if pager and "下一页" in pager[-1].xpath('text()')[0]:
         href = pager[-1].xpath('@href')
         if href:
             log.log("下一页链接   " + href[0], "debug")
             task_queue.add_task(
                 Sin_Req(href[0], self.parse_items, meta={"city": city}))
     for node in doc.xpath('//div[@class="zu-itemmod  "]'):
         item = {
             "city": city,
             "title": node.xpath('a/@title'),
             "link": node.xpath('a/@href'),
             "size_style": node.xpath('div[@class="zu-info"]/p[@class="details-item tag"]/span[1]/text()'),
             "price": node.xpath('div[@class="zu-side"]/p/strong/text()'),
             "name": node.xpath('div[@class="zu-info"]/address/a/text()'),
             "address": node.xpath('div[@class="zu-info"]/address/text()'),
         }
         log.log(str(item), "info")
         mon.insert("ajk_zufang", item)
コード例 #9
0
 def parse_details(self, response):
     """Parse a zone's house list: queue the next page, store each house."""
     city_name = response["city_name"]
     zone_name = response["zone_name"]
     doc = etree.HTML(response["response"])
     houses = doc.xpath('//div[@class="house-detail"]/ul/li')
     pager = doc.xpath('//div[@class="pages-box clearfix"]/div[2]/a[@class="turnpage_next"]')
     if pager and "下一页" in pager[0].xpath('span/text()')[0]:
         next_page = pager[0].xpath('@href')
         log.log(str(next_page), "debug")
         if next_page:
             zone_base_url = response["zone_base_url"]
             next_link = urljoin(zone_base_url, next_page[0])
             log.log(next_link, "info")
             task_queue.add_task(
                 Sin_Req(next_link, self.parse_details,
                         meta={"city_name": city_name,
                               "zone_name": zone_name,
                               "zone_base_url": zone_base_url}))
     for house in houses:
         data = {
             "title": house.xpath('div[1]/p[@class="house-title"]/a/@title'),
             "link": house.xpath('a/@href'),
             "base_info": house.xpath('div[1]/p[@class="house-about clearfix"]/span/text()'),
             "address": house.xpath('div[1]/p[@class="house-address clearfix"]/span[@class="whole-line"]/a/text()'),
             "price": house.xpath('div[@class="show-price"]/span[@class="sale-price"]/text()'),
             "uni_price": house.xpath('div[@class="show-price"]/p/text()'),
         }
         log.log(str(data), "info")
         mon.insert("data", data)
コード例 #10
0
 def start_req(self):
     """Seed the queue with the first follower-list request (offset 0)."""
     first = Sin_Req(self.base_url.format(0),
                     callback=self.parse_follow_lst,
                     meta={"offset": 0})
     first.headers.update(self.add_headers)
     task_queue.add_task(first)
コード例 #11
0
 def parse_item(self, response):
     """
     Parse one used-car list page (JSON API) and page forward.

     :param response: dict with the raw JSON body plus ``page``,
         ``cur_link``, ``series`` and ``car_name`` meta values
     :return: None; data goes to ``item_data``, the next page is queued

     Bug fixes:
     * ``next_task`` was built but never enqueued, so crawling stopped
       after the first page of every series;
     * meta carried the OLD ``page`` while the URL used ``page + 1``,
       which would have requested the same page forever — both the meta
       page and ``cur_link`` now describe the queued request.
     """
     page = response["page"]
     cur_link = response["cur_link"]
     series = response["series"]
     car_name = response["car_name"]
     try:
         req_data = json.loads(response["response"])
         if req_data["data"]["thisCity"]:
             data = dict()
             data["car_name"] = car_name
             data["item_info"] = req_data["data"]["thisCity"]
             # Mark the just-finished URL as done in the queue.
             task_queue.old_task(cur_link)
             log.log(str(data), "info")
             mon.insert("item_data", data)
             next_link = self.base_items_link.format(series=series,
                                                     page=page + 1)
             log.log("下一页连接    " + next_link, "debug")
             next_task = Sin_Req(next_link,
                                 self.parse_item,
                                 meta={
                                     "cur_link": next_link,
                                     "car_name": car_name,
                                     "page": page + 1,
                                     "series": series
                                 })
             task_queue.add_task(next_task)
     except Exception as e:
         log.log(str(e), "error")
コード例 #12
0
 def start_req(self):
     """Enqueue a sale-listing seed request for every configured city."""
     for city, link in self.task.items():
         task_queue.add_task(
             Sin_Req(link + r'sale/', self.parse_items,
                     meta={"city": city}))
コード例 #13
0
 def parse_items(self, response):
     """
     Parse a used-car list page and queue an inspection report per car.

     :param response: dict with the raw HTML and ``city_name``
     :return: None; rows go to ``item_data``, the next page is re-queued

     Fix: ``year_age`` came straight from an XPath query and was indexed
     without a length check — a listing missing that <span>'s text raised
     IndexError and aborted the whole page.  Missing values are now
     stored as None.
     """
     html = etree.HTML(response["response"])
     city_name = response["city_name"]
     items = html.xpath('//li[@class="con caritem conHeight"]')
     next_item = html.xpath('//div[@class="con-page search_page_link"]/a')
     log.log("下一页条目   " + str(next_item), "debug")
     for j in items:
         data = dict()
         title = j.xpath('@data-title')
         car_id = j.xpath('@data-carid')
         price = j.xpath('@data-price')
         link = j.xpath('div[@class="across"]/a/@href')
         city_id = j.xpath('div[@class="across"]/a/@data-cityid')
         brand_id = j.xpath('div[@class="across"]/a/@data-brandid')
         seriesid = j.xpath('div[@class="across"]/a/@data-seriesid')
         year_age = j.xpath(
             'div[@class="across"]/div[@class="pad"]/span[1]/text()')
         data["title"] = title
         data["car_id"] = car_id
         data["price"] = price
         data["link"] = link
         data["city_id"] = city_id
         data["city_name"] = city_name
         data["brand_id"] = brand_id
         data["series_id"] = seriesid
         # Guard against listings without the year/age text nodes.
         data["year"] = year_age[0] if len(year_age) > 0 else None
         data["age"] = year_age[1] if len(year_age) > 1 else None
         log.log(str(data), "info")
         mon.insert("item_data", data)
         if link:
             if car_id:
                 report_task = Sin_Req(self.base_report + car_id[0],
                                       self.parse_report,
                                       meta={"car_id": car_id})
                 # Report endpoint requires the listing page as Referer.
                 report_task.headers["Referer"] = urljoin(
                     self.base_url, link[0])
                 task_queue.add_task(report_task)
     if next_item:
         next_page = next_item[-1].xpath('text()')
         if next_page:
             if "下一页" in next_page[0]:
                 next_link = next_item[-1].xpath('@href')[0]
                 log.log("下一页   " + next_link, "debug")
                 task_queue.add_task(
                     Sin_Req(urljoin(self.base_url, next_link),
                             callback=self.parse_items,
                             meta={"city_name": city_name}))
コード例 #14
0
 def start_req(self):
     """
     Build one search request per configured keyword.

     :return: None; a ``Sin_Req`` per URL-quoted keyword is queued
     """
     for keyword in KEYS:
         task_queue.add_task(
             Sin_Req(self.base_url + quote(keyword), self.parse_lst))
コード例 #15
0
 def start_req(self):
     """
     Initialise the task queue.

     Rewrites each city's main-site link onto the rental subdomain and
     queues it as a seed ``Sin_Req``.
     """
     for city, link in self.task.items():
         rental_link = link.replace(r'anjuke', r'zu.anjuke')
         task_queue.add_task(
             Sin_Req(rental_link, self.parse_items, meta={"city": city}))
コード例 #16
0
 def start_req(self):
     """Queue page 1 of every configured start URL."""
     first_page = 1
     for base in self.start_urls:
         task_queue.add_task(
             Sin_Req(base + "p" + str(first_page) + r"/",
                     callback=self.parse_item,
                     meta={"now_page": first_page, "base_link": base}))
コード例 #17
0
 def start_req(self):
     """
     Initialise the task queue.

     :return: None; one ``Sin_Req`` per city pointing at its sale list
     """
     for city in self.task:
         task_queue.add_task(
             Sin_Req(self.task[city] + r'sale/', self.parse_items,
                     meta={"city": city}))
コード例 #18
0
 def start_req(self):
     """
     Initialise the task queue.

     :return: None; one seed request per configured car model
     """
     for car_name, car_base_link in self.cars.items():
         task_queue.add_task(
             Sin_Req(car_base_link, self.parse_items,
                     meta={"car": car_name}))
コード例 #19
0
 def article_start_req(self):
     """
     Build the first request for the lazily-loaded article list.

     The API needs the authorization/referer pair both as request
     headers and in ``meta`` so follow-up pages can reuse them.
     """
     add_headers = {
         'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
         'referer': 'https://www.zhihu.com/org/shu-ju-bing-shan/activities'
     }
     seed = Sin_Req(self.article_start_url,
                    callback=self.parse_arti_data,
                    meta={"add_headers": add_headers})
     seed.headers.update(add_headers)
     task_queue.add_task(seed)
コード例 #20
0
 def parse_lst(self, response):
     """
     Parse a result-list page: queue every detail page plus the next page.

     :param response: dict with the raw HTML body
     :return: None

     Fix: ``detail_url[0]`` was read (for logging) before any emptiness
     check, so an entry without a link raised IndexError and killed the
     whole page; such entries are now skipped.
     """
     html = etree.HTML(response["response"])
     items = html.xpath('//div[@id="infolist"]/dl')
     for i in items:
         detail_url = i.xpath('dt[@class="w325"]/a/@href')
         if not detail_url:
             continue
         log.log("详情页    " + detail_url[0], "debug")
         # Protocol-relative "//host/..." -> absolute https URL.
         cleaned_detail_url = "https://" + detail_url[0][2:]
         detail_task = Sin_Req(
             cleaned_detail_url.replace('single', "singles").replace(
                 r'/?psid', '?psid'), self.parse_detail)
         task_queue.add_task(detail_task)
     next_page = html.xpath('//a[@class="next"]/@href')
     if next_page:
         log.log("下一页的链接   " + next_page[0], "debug")
         next_task = Sin_Req(next_page[0], self.parse_lst)
         task_queue.add_task(next_task)
コード例 #21
0
 def start_req(self):
     """
     Seed one list-page request per (city, car type) pair.

     Fetches the city catalogue from the home-city API, then queues a
     listing request for every city/car combination.

     :return: None

     Fix: the inner loop reused the outer loop variable ``i`` (the city
     key), shadowing it mid-iteration; the loop variables are renamed.
     """
     r = requests.get("https://www.xin.com/apis/Ajax_common/get_home_city/?cityid=201")
     data = json.loads(r.text)
     for city_key in data["data"]["city_all"]:
         city = data["data"]["city_all"][city_key]["ename"]
         city_name = data["data"]["city_all"][city_key]["cityname"]
         for car in self.cars:
             start_link = urljoin(self.base_url, city + r'/' + car)
             log.log(start_link, "debug")
             task_queue.add_task(Sin_Req(start_link, callback=self.parse_items, meta={"city_name": city_name}))
コード例 #22
0
 def parse_start(self, response):
     """Parse the city index: store each highlighted city, queue its zones."""
     doc = etree.HTML(response["response"])
     city_blocks = doc.xpath('//ul[@class="cities-opts clearfix"]/li')
     log.log(str(city_blocks), "debug")
     for block in city_blocks:
         anchors = block.xpath('p/a[@class="highlight"]')
         log.log(str(anchors), "debug")
         for anchor in anchors:
             name = anchor.xpath('text()')[0]
             link = anchor.xpath('@href')[0]
             data = {"city_name": name, "city_link": link}
             mon.insert("city_data", data)
             log.log(str(data) + "  ", "info")
             task_queue.add_task(
                 Sin_Req(url="https:" + link,
                         callback=self.parse_zones,
                         meta={"city_link": link, "city_name": name}))
コード例 #23
0
 def parse_items(self, response):
     """
     Parse a second-hand-housing list page.

     :param response: list-page response payload with the city name
     :return: None; the next page is re-queued and each row is stored
         in the ``ajk_ershou`` collection
     """
     doc = etree.HTML(response["response"])
     city = response["city"]
     pager = doc.xpath('//div[@class="multi-page"]/a')
     if pager and "下一页" in pager[-1].xpath('text()')[0]:
         href = pager[-1].xpath('@href')
         if href:
             log.log("下一页链接   " + href[0], "debug")
             task_queue.add_task(
                 Sin_Req(href[0], self.parse_items, meta={"city": city}))
     for node in doc.xpath('//ul[@id="houselist-mod-new"]/li'):
         item = {
             "city": city,
             "title": node.xpath('div[@class="house-details"]/div[1]/a/@title'),
             "link": node.xpath('div[@class="house-details"]/div[1]/a/@href'),
             "room_style": node.xpath('div[@class="house-details"]/div[2]/span[1]/text()'),
             "size": node.xpath('div[@class="house-details"]/div[2]/span[2]/text()'),
             "price": node.xpath('div[@class="pro-price"]/span[@class="price-det"]/strong/text()'),
             "uni_price": node.xpath('div[@class="pro-price"]/span[@class="unit-price"]/text()'),
             # Year sits in the second-to-last span of the details row.
             "year": node.xpath('div[@class="house-details"]/div[2]/span')[-2].xpath('text()'),
             "name_location": node.xpath('div[@class="house-details"]/div[3]/span/@title'),
         }
         log.log(str(item), "info")
         mon.insert("ajk_ershou", item)
コード例 #24
0
 def parse_comment(self, response):
     """
     Parse one page of an article's comments.

     Stores the payload and, while ``paging.is_end`` is false, queues
     the next comment page (20 comments per page offset).

     :param response: dict with the raw JSON body, ``page``,
         ``article_id`` and ``add_headers``
     :return: None
     """
     page = response["page"]
     article_id = response["article_id"]
     add_headers = response["add_headers"]
     try:
         data = json.loads(response["response"])
         if "paging" not in data:
             return
         if data["paging"]["is_end"]:
             log.log("Last Comment Data" + str(data), "info")
             mon.insert("comment_data", data)
         else:
             log.log(str(data), "info")
             mon.insert("comment_data", data)
             follow_up = Sin_Req(self.comment_base_url.format(id=article_id, page=20 * (page + 1)), callback=self.parse_comment, meta={"add_headers": add_headers, "page": page + 1, "article_id": article_id})
             follow_up.headers.update(add_headers)
             task_queue.add_task(follow_up)
     except Exception as e:
         log.log(str(e), "error")
コード例 #25
0
 def start_req(self):
     """Seed 22 paged list requests (offset steps of 12 items)."""
     for page in range(22):
         task_queue.add_task(
             Sin_Req(self.base_url.format(page * 12), self.parse_items))
コード例 #26
0
 def start_req(self):
     """Queue one rental-subdomain seed request per configured city."""
     for city in self.task:
         rent_link = self.task[city].replace(r'anjuke', r'zu.anjuke')
         task_queue.add_task(
             Sin_Req(rent_link, self.parse_items, meta={"city": city}))
コード例 #27
0
ファイル: test.py プロジェクト: wudangqibujie/Spider_Project
# Smoke test for the queue + request pipeline: enqueue one request,
# pop it back off the queue, and execute it against a live page.
from m_queue import TaskQueue
from lxml import etree

tt = TaskQueue()
from req import Sin_Req  # NOTE(review): mid-file import — belongs at the top
url = "https://sz.lianjia.com/ershoufang/pg2/"


def A(response):
    # Callback: print the page <title> to confirm the fetch worked.
    html = etree.HTML(response["response"])
    title = html.xpath('//title/text()')
    print(title)


rr = Sin_Req(url=url, callback=A)
import pickle  # NOTE(review): unused — presumably left from a serialization experiment
tt.add_task(rr)
a = tt.pop_task()
a.get()  # performs the live HTTP request
コード例 #28
0
 def start_req(self):
     """Fire the initial request synchronously against the start URLs."""
     seed = Sin_Req(self.start_urls, self.parse_start)
     seed.get()