Example 1
    def start_requests(self):
        keys = self.settings.get('KEYS')
        self.browser, self.cookies = register()  # dict of cookies obtained after logging in
        self.browser.get(self.base_url[0] + keys)  # open the search-results page for the keyword (e.g. '手机' / "mobile phone")
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        url_i = self.browser.current_url
        html = self.browser.page_source

        # start crawling
        print('-------start scrapy--------')
        yield scrapy.Request(url=self.base_url[0] + keys, headers=self.re_headers, cookies=self.cookies,
                             callback=self.parse,
                             meta={'html': html, 'i': 0, 'url': url_i})
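
All of these examples unpack self.browser, cookies = register(), so register() is assumed to be a project-level helper (not shown in the source) that logs in with Selenium and returns the driver together with its cookies. A minimal sketch under that assumption; the login flow itself is hypothetical, and Taobao's real login typically needs manual or QR-code interaction:

from selenium import webdriver

def register():
    """Hypothetical helper: start a logged-in Selenium session and return
    (driver, cookies), where cookies is a name -> value dict that
    scrapy.Request's cookies= argument accepts directly."""
    driver = webdriver.Chrome()
    driver.get('https://login.taobao.com/')
    # ... complete the login here (manually or via automation) ...
    cookies = {c['name']: c['value'] for c in driver.get_cookies()}
    return driver, cookies
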
Example 2
 def start_requests(self):
     keys = self.settings.get('KEYS')
     self.browser, cookies = register()
     self.browser.get(self.base_url[0] + keys)
     self.browser.execute_script(
         "window.scrollTo(0, document.body.scrollHeight)")
     url_i = self.browser.current_url
     html = self.browser.page_source
     yield scrapy.Request(url=self.base_url[0] + keys,
                          headers=self.re_headers,
                          cookies=cookies,
                          callback=self.parse,
                          meta={
                              'html': html,
                              'i': self.i,
                              'url': url_i
                          })
Example 3
 def start_requests(self):
     keys = self.settings.get('KEYS')
     self.browser, c_list = register()
     # keep the login cookies both globally and on the spider instance
     global cookies
     cookies = c_list
     self.cookies = c_list
     self.browser.get(self.base_url[0] + keys)
     self.browser.execute_script(
         "window.scrollTo(0, document.body.scrollHeight)")
     url_i = self.browser.current_url
     html = self.browser.page_source
     # yield SplashRequest(url=self.base_url[0],
     yield scrapy.Request(
         url=self.base_url[0],
         callback=self.parse,
         # args={'wait': '0.5', 'timeout': 3600},
         meta={
             'html': html,
             'i': self.i,
             'url': url_i
         },
         cookies=c_list,
         headers=self.re_headers)
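
Example 3 passes the raw c_list from register() straight into cookies=. That works because scrapy.Request accepts either a {name: value} dict or a list of dicts with 'name' and 'value' keys, and Selenium's get_cookies() already returns the latter shape. A sketch of the conversion this variant of register() presumably relies on:

from selenium import webdriver

def selenium_cookies_for_scrapy(driver):
    # get_cookies() returns dicts like {'name': ..., 'value': ...,
    # 'domain': ..., 'path': ...}; Scrapy only needs name and value.
    return [{'name': c['name'], 'value': c['value']}
            for c in driver.get_cookies()]
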
Example 4
 def parse(self, response):
     time.sleep(10)
     html = response.meta.get('html')
     i = response.meta.get("i")
     url_i = response.meta.get("url")
     i += 1
     if i > 10:
         return
     try:
         soup = BeautifulSoup(html, 'html.parser')  # HTML parser
         if i == 1:
             yield self.get_base_category(soup)
         lists = soup.select(
             '#mainsrp-itemlist > div > div > div > div')  # all product entries on this page
         for goods in lists:
             # not_item_taobao_com: an href-filtering predicate defined elsewhere
             url_detail = goods.find(
                 'a',
                 attrs={'class': "pic-link J_ClickStat J_ItemPicA"},
                 href=not_item_taobao_com)
             if url_detail:
                 url_detail = 'https:' + url_detail.attrs.get('href')
                 #url_detail='https://detail.tmall.com/item.htm?id=598079959720&cm_id=140105335569ed55e27b&abbucket=7&sku_properties=10004:653780895;5919063:6536025'
                 self.browser.get(url_detail)  # open this product's detail page in the browser
                 self.browser.execute_script(
                     "window.scrollTo(0, document.body.scrollHeight)")
                 url_i_detail = self.browser.current_url
                 html_detail = self.browser.page_source
                 header = {
                     'user-agent':
                     'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
                 }
                 yield scrapy.Request(url=url_detail,
                                      headers=header,
                                      cookies=self.cookies,
                                      callback=self.parse_detail,
                                      meta={
                                          'html': html_detail,
                                          'url': url_i_detail
                                      },
                                      dont_filter=True)
         print('finished crawling this page!')
         button = self.browser.find_elements(
             By.XPATH, '//a[@class="J_Ajax num icon-tag"]')[-1]  # the "next page" button
         button.click()
         time.sleep(random.random() * 2)
         # scroll to the very bottom of the page
         self.browser.execute_script(
             "window.scrollTo(0, document.body.scrollHeight)")
         url_i = self.browser.current_url
         html = self.browser.page_source  # get the page source
         yield scrapy.Request(url=response.url,
                              headers=self.re_headers,
                              callback=self.parse,
                              cookies=self.cookies,
                              meta={
                                  'html': html,
                                  'i': i,
                                  'url': url_i
                              },
                              dont_filter=True)
     except Exception as e:
         time.sleep(10)
         print(e)
         self.browser.close()
         self.browser, cookies = register()
         self.browser.get(url=url_i)
         time.sleep(random.random() * 2)
         self.browser.execute_script(
             "window.scrollTo(0, document.body.scrollHeight)")
         html = self.browser.page_source
         yield scrapy.Request(url=response.url,
                              headers=self.re_headers,
                              callback=self.parse,
                              cookies=cookies,
                              meta={
                                  'html': html,
                                  'i': i,
                                  'url': url_i
                              },
                              dont_filter=True)
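
The except branch here is a crude retry: the broken browser is discarded, register() builds a fresh logged-in session, the current page is reloaded, and the same request is re-yielded (dont_filter=True is required because response.url repeats). A refactoring sketch of that recovery step as a spider method; this helper is hypothetical and not part of the original spider:

import random
import time

def refresh_browser(self, url):
    """Tear the browser down, log in again via register(), reload the
    page, scroll to the bottom, and return the fresh page source."""
    self.browser.close()
    self.browser, self.cookies = register()  # register() as assumed above
    self.browser.get(url)
    time.sleep(random.random() * 2)
    self.browser.execute_script(
        "window.scrollTo(0, document.body.scrollHeight)")
    return self.browser.page_source
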
Example 5
 def parse(self, response):
     time.sleep(5)
     html = response.meta.get('html')
     i = response.meta.get("i")
     url_i = response.meta.get("url")
     i += 1
     if i > 100:
         return
     try:
         soup = BeautifulSoup(html, 'html.parser')
         lists = soup.select('#mainsrp-itemlist > div > div > div > div')
         for goods in lists:
             item = TaobaoSItem()
             url = goods.select(
                 'a[class="pic-link J_ClickStat J_ItemPicA"]')[0].attrs.get(
                     'href', '')
             name = goods.select(
                 "a[class='J_ClickStat']")[0].get_text().strip()
             name = data_cleaning(name)  # data_cleaning: text-normalising helper defined elsewhere
             price = goods.select(
                 'div[class="price g_price g_price-highlight"] strong'
             )[0].get_text()
             num = goods.select('div[class="deal-cnt"]')[0].get_text()
             shop_name = goods.select(
                 "a[class='shopname J_MouseEneterLeave J_ShopInfo']"
             )[0].get_text().strip()
             shop_name = data_cleaning(shop_name)
             item['url'] = url
             item['name'] = name
             item['price'] = price
             item['num'] = num
             item['shop_name'] = shop_name
             yield item
         button = self.browser.find_elements(
             By.XPATH, '//a[@class="J_Ajax num icon-tag"]')[-1]
         button.click()
         time.sleep(random.random() * 2)
         self.browser.execute_script(
             "window.scrollTo(0, document.body.scrollHeight)")
         html = self.browser.page_source
         yield scrapy.Request(url=response.url,
                              headers=self.re_headers,
                              callback=self.parse,
                              meta={
                                  'html': html,
                                  'i': i,
                                  'url': url_i
                              },
                              dont_filter=True)
     except Exception as e:
         time.sleep(10)
         print(e)
         self.browser.close()
         self.browser, cookies = register()
         self.browser.get(url=url_i)
         time.sleep(random.random() * 2)
         self.browser.execute_script(
             "window.scrollTo(0, document.body.scrollHeight)")
         html = self.browser.page_source
         yield scrapy.Request(url=response.url,
                              headers=self.re_headers,
                              callback=self.parse,
                              meta={
                                  'html': html,
                                  'i': i,
                                  'url': url_i
                              },
                              dont_filter=True)
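
This example and the next both populate a TaobaoSItem, whose definition is not shown. Judging from the fields assigned across Examples 5 and 6, it would look roughly like this:

import scrapy

class TaobaoSItem(scrapy.Item):
    # fields inferred from the assignments in Examples 5 and 6
    url = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    num = scrapy.Field()
    shop_name = scrapy.Field()
    tmall = scrapy.Field()
    sales = scrapy.Field()
    title = scrapy.Field()
    type = scrapy.Field()
    brand = scrapy.Field()
    model = scrapy.Field()
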
Example 6
    def parse(self, response):

        time.sleep(1)
        html = response.meta.get('html')
        i = response.meta.get("i")
        url_i = response.meta.get("url")
        main_page_handler = self.browser.window_handles[0]

        # i +=1
        # change the limit below to i > 100 when crawling all search results
        if i > 100:
            return
        try:

            i += 1
            soup = BeautifulSoup(html, 'html.parser')
            products = soup.select('#mainsrp-itemlist > div > div > div > div')
            # n=0
            for product in products:
                # if n > 4:
                #     break
                # n += 1
                item = TaobaoSItem()
                url = "https:" + product.select(
                    'a[class="pic-link J_ClickStat J_ItemPicA"]')[0].attrs.get(
                        'href', '')
                if 'simba' in url:  # skip sponsored (ad) links
                    continue
                # defaults, so a failed lookup below cannot raise NameError
                shop_name = name = price = None
                try:
                    shop_name = product.select(
                        'a[class="shopname J_MouseEneterLeave J_ShopInfo"]'
                    )[0].get_text().split()[-1]
                except Exception as e:
                    print("@@@ shop name get failed.")
                    print(e)
                try:
                    name = product.select(
                        'a[class="J_ClickStat"]')[0].get_text().strip()
                except Exception as e:
                    print("@@@ name get failed.")
                    print(e)
                try:
                    price = product.select(
                        'div[class="price g_price g_price-highlight"]'
                    )[0].get_text().strip()[1:]
                except Exception as e:
                    print("@@@ price get failed.")
                    print(e)

                brand = None
                goods_type = None
                model = None
                title = None
                sales = None

                # open this product's detail page in a new browser tab
                self.browser.switch_to.window(main_page_handler)
                goods_page = 'window.open("{}");'.format(url)
                self.browser.execute_script(goods_page)
                time.sleep(3)

                # switch the driver to the newly opened detail tab
                for handle in self.browser.window_handles:
                    if handle != main_page_handler:
                        self.browser.switch_to.window(handle)
                self.browser.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight)")

                if 'tmall' in url:
                    mall_name = 'tmall'
                    try:
                        sales = self.browser.find_elements(
                            By.XPATH,
                            '//li[@class="tm-ind-item tm-ind-sellCount"]/*/span[@class="tm-count"]'
                        )[0].text
                    except Exception as e:
                        print("@@@ sales get failed.")
                        print(e)

                    for param in self.browser.find_elements(
                            By.XPATH, '//*[@id="J_AttrUL"]/li'):
                        text = param.text
                        if text.startswith(u'品牌:'):  # '品牌' = brand
                            brand = text
                        if text.startswith(u'型号:'):  # '型号' = model
                            model = text
                else:
                    mall_name = 'taobao'
                    try:
                        sales = self.browser.find_elements(
                            By.ID, 'J_SellCounter')[0].text
                    except Exception as e:
                        print("@@@ sales get failed.")
                        print(e)
                    for param in self.browser.find_elements(
                            By.XPATH, '//ul[@class="attributes-list"]/li'):
                        text = param.text
                        if text.startswith(u'品牌:'):  # '品牌' = brand
                            brand = text
                        if text.startswith(u'型号:'):  # '型号' = model
                            model = text

                self.browser.close()  # close the detail tab
                item['tmall'] = mall_name
                item['url'] = url
                item['name'] = name
                item['price'] = price
                item['shop_name'] = shop_name
                item['sales'] = sales
                item['title'] = title
                item['type'] = goods_type
                item['brand'] = brand
                item['model'] = model

                yield item

            self.browser.switch_to.window(main_page_handler)
            self.c_url = self.browser.current_url
            time.sleep(4)
            button = self.browser.find_elements(
                By.XPATH, '//a[@class="J_Ajax num icon-tag"]')[-1]

            button.click()
            time.sleep(random.random() * 2)
            self.browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight)")
            html = self.browser.page_source
            # self.url_i = response.url
            yield scrapy.Request(
                url=self.c_url,
                headers=self.re_headers,
                callback=self.parse,
                meta={
                    'html': html,
                    'i': i,
                    'url': url_i
                },
                dont_filter=True,
                # args={'wait': '0.5', 'timeout': 3600},
                cookies=self.cookies)  #, errback=self.err)

        except Exception as e:
            time.sleep(2)
            print(e)
            self.browser.switch_to.window(main_page_handler)
            self.browser.close()
            self.browser, cookies = register()
            self.browser.get(url=response.url)
            time.sleep(random.random() * 2)
            self.browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight)")
            html = self.browser.page_source
            yield scrapy.Request(
                url=response.url,
                headers=self.re_headers,
                callback=self.parse,
                meta={
                    'html': html,
                    'i': i,
                    'url': url_i
                },
                dont_filter=True,
                # args={'wait': '0.5', 'timeout': 3600}
            )
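
Example 6's per-product tab dance (open a tab, switch to it, scrape, close, switch back) leaks the detail tab if an exception fires mid-product. A context-manager sketch of the same pattern; this helper is hypothetical and not part of the original spider:

from contextlib import contextmanager

@contextmanager
def new_tab(browser, url, main_handle):
    """Open url in a new tab, switch to it, and always close it and
    return to the main window afterwards."""
    browser.execute_script('window.open("{}");'.format(url))
    detail = [h for h in browser.window_handles if h != main_handle][-1]
    browser.switch_to.window(detail)
    try:
        yield browser
    finally:
        browser.close()                      # close the detail tab
        browser.switch_to.window(main_handle)
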