Code Example #1
    def parse(self, response):
        try:
            json_obj = json.loads(response.text)  # response.text is a property in Scrapy, not a method
            results = json_obj["results"]["reviews"]
            page_id = json_obj["results"]["page_id"]

            if len(results) > 0:
                with open("reviews.json", "a+") as f:
                    for review in results:
                        review["page_id"] = page_id
                        f.write(json.dumps(review) + "\n")

            if "next_page_url" in json_obj["paging"]:
                next_page_url = json_obj["paging"]["next_page_url"]
                if len(next_page_url) > 0:
                    logger.info("next paging: %s", next_page_url)
                    # yield scrapy.Request(url=next_page_url,
                    #                      headers=self.headers,
                    #                      callback=self.parse)

            return

        except Exception as e:
            logger.exception("Error parsing review: %s", e)
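A minimal sketch of a spider class that could host the callback above, assuming Scrapy is installed; the class name, spider name, and start URL are placeholders, not taken from the original project:

import json
import logging

import scrapy

logger = logging.getLogger(__name__)


class ReviewSpider(scrapy.Spider):
    # Hypothetical wrapper around a parse() callback like the one shown above.
    name = "reviews"
    start_urls = ["https://example.com/api/reviews?page=1"]  # placeholder URL

    def parse(self, response):
        data = json.loads(response.text)  # response.text is a property, not a method
        for review in data["results"]["reviews"]:
            yield review
        next_page = data.get("paging", {}).get("next_page_url")
        if next_page:
            logger.info("next page: %s", next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)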
Code Example #2
    def parse_company_page(self, response):
        item = HuicongItem()
        item["companyName"] = response.meta["company_name"]
        item["companyUrl"] = response.meta['company_url']

        soup = BeautifulSoup(response.body, "lxml")  # explicit parser avoids the bs4 "no parser specified" warning
        detail_info = soup.select(
            'div[class^="contentbox"] div[class^="detailsinfo"]')
        self.get_company_info(detail_info, item)

        try:
            item['memberInfo'] = re.sub(
                '[\t\n\r]', '',
                soup.select('div[class^="contentbox"] div[class^="memyear"]')
                [0].find("span").text.strip())
        except Exception:
            item['memberInfo'], item['MyeeigIndex'], item['merchantGrade'] = [
                "", "", ""
            ]
        else:
            if u"买卖通会员" not in item['memberInfo']:
                item['MyeeigIndex'] = ""
            else:
                item['MyeeigIndex'] = soup.select(
                    'div[class^="contentbox"] div[class^="comInfogo"] span[class^="redbold14"]'
                )[0].a.text

            item['merchantGrade'] = soup.select(
                'div[class^="contentbox"] div[style^="color"] a[target^="_blank"]'
            )[0].img['src'].split('/')[-1].split('.')[0]

        item['contactPerson'], item['cellphone'], item['phone'], item[
            'fax'] = "", "", "", ""

        try:
            contact_info = soup.select(
                'div[class^="contentbox"] div[class^="contactbox"]')[0]
        except Exception as ex:
            logger.info(f"[SPIDERCONTACT] {response.url} {ex}")
        else:
            item['contactPerson'] = ''.join([
                info.text.strip()
                for info in contact_info.find_all("li")[0].find_all("span")
            ])
            contact_ways = contact_info.find_all("li")[2:-1]
            try:
                for contact_way in contact_ways:
                    if u'电话' in contact_way['title'].split(u":"):
                        item['phone'] = contact_way['title'].split(u":")[1]
                    if u'手机' in contact_way['title'].split(u":"):
                        item['cellphone'] = contact_way['title'].split(u":")[1]
                    if u'传真' in contact_way['title'].split(u":"):
                        item['fax'] = contact_way['title'].split(u":")[1]
            except Exception as ex:
                logger.info(f"[SPIDERCONTACTWAY] {response.url} {ex}")

        item['collctTime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime(time.time()))

        return item
Code Example #3
File: pipelines.py  Project: SatoKoi/Python
    def process_item(self, item, spider):
        if isinstance(item, BiqugeItem):
            flag = 1
            name = item['name']
            author = item['book_author']
            _id = item['book_id']
            url = item['book_url']
            category = item['category']
            status = item['status']
            ret = Sql.check_book_id(_id)
            if ret != 1:
                Sql.insert_message(name, author, category, status, url, _id, flag=flag)
            else:
                logger.info('该书 {} 已存在数据库'.format(name))

        if isinstance(item, BookItem):
            flag = 2
            num = item['num']
            chapter_name = item['chapter_name']
            chapter_url = item['chapter_url']
            chapter_id = item['chapter_id']
            book_id = item['book_id']
            book_name = item['book_name']
            chapter_content = item['chapter_content']
            ret = Sql.check_chapter_id(chapter_id)
            if ret != 1:
                Sql.insert_message(book_name, book_id, chapter_name, chapter_id, chapter_url, chapter_content, flag=flag)
            else:
                logger.info('该章节 {} {} 已存在数据库'.format(book_name, chapter_name))
        return item
Code Example #4
 def parse_login(self, response):
     info = json.loads(response.text)
     if info['error'] == '0':
         logger.info('登录成功:-)')
         return super().start_requests()
     logger.info('登录失败:-(, 重新登录...')
     return self.start_requests()
Code Example #5
 def __init__(self):
     with open("data/list_user_agent.txt") as f:
         self.user_agent_list = []
         for line in f:
             self.user_agent_list.append(line.strip().replace(",", ""))
     self.number_of_requests_interval = 100
     logger.info("Enable to get randon user-agent")
Code Example #6
File: Zaker_spider.py  Project: lngbll/JOVI
 def get_content(self, response):
     meta = response.meta
     item = JoviLonglasttimeItem()
     item['article_url'] = response.url
     item['first_tag'] = meta['first_tag']
     item['second_tag'] = meta['second_tag']
     host = urlparse(response.url).netloc
     xpath = self.xpath.get(host)
     if xpath:
         item['article_title'] = response.xpath(
             xpath['title']).get().strip()
         ps = response.xpath(xpath['ps']).getall()
     else:
         logger.info('No XPath rules configured for this URL: {}'.format(
             response.url))
         return
     content = ''
     for p in ps:
         if re.search(
                 r'责任编辑:|作者:|出处:|{}|来自:|来源 :|来源:|来源 : |图片来自|图片由|图:|更多精彩|请投稿至:|文|文/|编辑',
                 p):
             continue
         elif re.search(r'关注微信公众号|参考资料|声明:|原网页已经由 ZAKER 转码排版 |推荐阅读', p):
             break
         else:
             content += p.strip()
     item['article_content'] = content.replace('\n', '').replace(
         '\r', '').replace('\t', '').replace('\u3000',
                                             '').replace('\xa0', '')
     yield item
Code Example #7
File: pipelines.py  Project: MeiYu7/JD
 def close_spider(self, spider):
     if len(self.goods_items) > 0:
         self.insert_many_goods()
     self.client.close()
     print("最终插入{}条".format(self.insert_num))
     self.stats.set_value("finaly_insert_item", self.insert_num)
     logger.info("最终插入{}条".format(self.insert_num))
Code Example #8
    def parse(self, response):
        with open('./detail.txt', 'wb') as f:
            f.write(response.body)
        itemdetail = TesterhomeDetailSpiderItem()
        itemdetail['topic_id'] = response.xpath('//a[contains(@class, "qrcode")]/@data-url').extract()[0].split('/')[-1]
        itemdetail['topic_title'] = response.xpath('//div[contains(@class, "media-body")]/h1/text()').extract()[0]
        topic_body = ''
        for i in response.xpath('//div[contains(@class, "panel-body markdown")]/article/p/text()').extract():
            topic_body += i
        # topic_body
        itemdetail['topic_body'] = topic_body
        itemdetail['topic_author'] = response.xpath('//a[contains(@data-author, "true")]/@data-name').extract()[0]
        itemdetail['topic_like_num'] = response.xpath('//a[contains(@class, "likeable")]/@data-count').extract()[0]
        itemdetail['topic_reply_num'] = response.xpath('//div[contains(@class, "total panel-heading")]/b/text()').extract()[0]
        itemdetail['topic_timeago'] = response.xpath('//abbr[contains(@class, "timeago")]/@title').extract()[0]
        yield itemdetail

        # for sel in response.xpath('//div[contains(@class, "infos")]'):
        #     itemreplydetail = TesterhomeDetailReplySpiderItem()
        #     itemreplydetail['topic_reply_author'] = sel.xpath('div[contains(@class, info)]/span[contains(@class, "name")]/a/@data-name').extract()[0]
        #     itemreplydetail['topic_reply_timeago'] = sel.xpath('div[contains(@class, info)]/span[contains(@class, "time")]/abbr/@title').extract()[0]
        #     itemreplydetail['topic_reply_like_num'] = sel.xpath('div[contains(@class, info)]/span[contains(@class, "opts pull-right")]/a[contains(@class, "likeable")]/@data-count').extract()[0]
        #     for i in sel.xpath('div[contains(@class, markdown)]/p/text()').extract():
        #         itemreplydetail['topic_reply_author'] += i
        #     yield itemreplydetail

        if int(itemdetail['topic_id']) < 100:
            yield Request('http://testerhome.com/topics/'+str(int(itemdetail['topic_id'])+1), callback=self.parse)
            # self.start_urls.append('http://testerhome.com/topics/'+str(int(itemdetail['topic_id'])+1))
        else:
            logger.info('topic_id > 100')
Code Example #9
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ListeningKafkaSpider,
                       cls).from_crawler(crawler, *args, **kwargs)

        if not hasattr(spider, 'topic') or not spider.topic:
            spider.topic = '%s-starturls' % spider.name

        hosts = crawler.settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
        consumer_group = crawler.settings.get(
            'SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = SimpleClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        spider.consumer = SimpleConsumer(_kafka,
                                         consumer_group,
                                         spider.topic,
                                         auto_commit=True,
                                         iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        crawler.signals.connect(spider.item_scraped,
                                signal=signals.item_scraped)
        logger.info("Reading URLs from kafka topic '%s'" % spider.kafka_topic)

        return spider
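The snippet above connects spider_idle and item_scraped but does not show the handlers; below is a minimal sketch of what spider_idle typically does in this scrapy-kafka pattern. The get_messages() call, the message decoding, and the DontCloseSpider usage are assumptions based on the old kafka-python SimpleConsumer API, not code from the original project:

from scrapy import Request
from scrapy.exceptions import DontCloseSpider


def spider_idle(self):
    # Hypothetical handler: pull a batch of URLs from the Kafka consumer,
    # schedule them, then keep the spider alive so it can idle again later.
    for offset_and_message in self.consumer.get_messages(count=10, block=False):
        url = offset_and_message.message.value.decode("utf-8")
        self.crawler.engine.crawl(Request(url, callback=self.parse), self)
    raise DontCloseSpider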
Code Example #10
    def process_request(self, request, spider):

        print("**************ProxyMiddleware have pass************" +
              self.proxy)
        request.meta['proxy'] = "http://" + self.proxy
        request.meta['handle_httpstatus_list'] = [200]
        logger.info("process_request eval! %s" % request.meta)
Code Example #11
    def setup_kafka(self, settings):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP',
                                      'scrapy-kafka')
        _kafka = SimpleClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka,
                                       consumer_group,
                                       self.topic,
                                       auto_commit=True,
                                       iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped,
                                     signal=signals.item_scraped)
        logger.info("Reading URLs from kafka topic '%s'" % self.kafka_topic)
Code Example #12
File: pipelines.py  Project: rory7/ugcVgtime
    def parse_artclieItem(self, item, spider):
        article = dict()
        title = join(item['title']).replace(" ", "")
        auther = join(item['auther']).replace(" ", "")
        source = join(item['source']).replace(" ", "")
        catalog = join(item['catalog']).replace(" ", "")
        art_url = join(item['artUrl']).replace(" ", "")
        create_time = item['createTime']
        artTime = join(item['artTime']).replace(" ", "")
        artFromUrl = join(item['artFromUrl']).replace(" ", "")
        artImageUrl = join(item['artImageUrl']).replace(" ", "")
        conList = item['content']
        # allhtml = join(item['xpathTag'])

        # artInfo: header / identification info for the article
        article['artInfo'] = {
            'catalog': catalog,
            'title': title,
            'source': source,
            'art_url': art_url,
            'auther': auther,
            'create_time': create_time,
            'artTime': artTime,
            'artFromUrl': artFromUrl,
            'artImageUrl': artImageUrl
        }

        # artTypeInfo: which kinds of content the article contains
        isString = item['isString']
        isImage = item['isImage']
        isVideo = item['isVideo']

        article['artTypeInfo'] = {
            'isString': isString,
            'isImage': isImage,
            'isVideo': isVideo
        }

        # artContent: list holding the article's actual content
        article['artContent'] = {'content': conList}

        logger.info("pipelines->parse_artclieItem=" + article.__str__() +
                    "\n itemlen=" + article.__len__().__str__())

        print "pipelines->parse_artclieItem=" + article.__str__(
        ) + "\n itemlen=" + article.__len__().__str__()

        jsonOutPut = json.dumps(article, ensure_ascii=True, encoding='utf-8')

        logger.info("pipelines->parse_artclieItem=" + jsonOutPut +
                    "\n itemlen=" + article.__len__().__str__())

        # 输出路径 指定目录下->json文件
        fileName = "vgtime/" + title + ".json"
        filePath = Constant.jsonOutPath + "/" + fileName
        jsonFile = open(filePath, 'wb')
        jsonOutPut.replace(" ", "")
        jsonFile.write(jsonOutPut.__str__())
        jsonFile.close()
Code Example #13
File: login.py  Project: eilinge/scrapy
 def parse_login(self, response):
     #pdb.set_trace()
     info = json.loads(response.text)
     if info['error'] == '0':
         logger.info('ok')
         return super().start_requests()
     logger.info('failed')
     return self.start_requests()
Code Example #14
 def parse_login(self, response):
     info = json.loads(response.text)
     if info['error'] == '0':
         logger.info('Login Success')
         return super().start_requests()
     
     logger.info('Login Fail')
     return self.start_requests()
Code Example #15
    def parse_login(self, response):
        # Decide from the response whether the login succeeded
        info = json.loads(response.text)
        if info['error'] == '0':
            logger.info('登录成功')
            return super().start_requests()

        logger.info('登录失败')
        return self.start_requests()
Code Example #16
 def init(self):
     with codecs.open(CRAWLED_STORE_FILE_PATH, 'r') as file:
         for line in file.readlines():
             try:
                 link = json.loads(line)['companyUrl'].strip()  # the 'encoding' kwarg was removed in Python 3.9
                 self.add(namespace_url(link))
             except Exception as ex:
                 logger.info(f"[URLFILTERERROR] {line} {ex}")
     logger.info(f"[URLFILTERINIT] {self.count}")
Code Example #17
File: newhouse.py  Project: khle08/Lianjia
 def start_requests(self):
     for city, url in all_city_map().items():
         page = get_new_house_page(url)
         self.city_url = url
         self.city = city
         logger.info('{} 一共有{} 页'.format(city, page))
         for i in range(1, page + 1):
             crawl_url = '{}/loupan/pg{}'.format(url, str(i))
             yield Request(crawl_url, self.parse, dont_filter=True)
Code Example #18
 def parse_login_ed(self, response):
     # Check whether the login succeeded; if not, log in again.
     # The response body is a JSON string containing the authentication result;
     # convert it to a dict and check the "error" field.
     info = json.loads(response.text)
     if info['error'] == '0':
         logger.info('登录成功:-)')
         return super().start_requests()
     else:
         logger.info('登录失败:-( 重新登录...')
         return self.start_requests()
Code Example #19
File: middlewares.py  Project: kaiven11/crawl_project
 def process_request(self, request, spider):
     # Example proxy pool (defined here but not actually used below).
     available = [
         "http://186.249.71.237:20183",
         'http://39.104.55.229:8080',
     ]
     print("**************ProxyMiddleware have pass************" +
           self.proxy)
     request.meta['proxy'] = "http://" + self.proxy
     request.meta['download_timeout'] = 20
     request.meta['retry_times'] = 2
     logger.info("process_request eval! %s" % request.meta)
Code Example #20
    def process_item(self, item, spider):
        if isinstance(item, CommitItem):
            try:
                self.conn.commit()
            except Exception as e:
                logger.info('commit failed: %s', e)

        else:
            self.cursor.execute(
                self.sql, (item["bureau"], item["station"], item["name"],
                           item["address"], item["passenger"], item["luggage"],
                           item["package"], item["turn"]))
        return item
Code Example #21
 def parse_login(self, response):
     # Gotcha (took two hours to track down): the server wraps the JSON in quotes
     # and escapes it, so strip the outer quotes and the backslashes before parsing.
     response_text = response.text.replace("\"{", "{").replace("}\"",
                                                               "}").replace(
                                                                   "\\", "")
     result = json.loads(response_text)
     print(response_text)
     print("登录结果:" + str(result['isSuccess']))
     if result['isSuccess']:
         logger.info('登录成功')
         return super().start_requests()
     logger.info("登录失败,重新登录。")
     return self.start_requests()
Code Example #22
File: pipelines.py  Project: y503869692/mySpider
    def wrapper(self, item, spider):

        # message template for debugging; the literal {0} placeholder is
        # filled in below with "executing" or "skipping"
        msg = "{{0}} {0} pipeline step".format(self.__class__.__name__)

        # if this class is in the spider's pipeline, then use the
        # process_item method normally.
        if self.__class__ in spider.pipeline:
            logger.info(msg.format("executing"))
            return process_item_method(self, item, spider)

        # otherwise, just return the untouched item (skip this step in
        # the pipeline)
        else:
            logger.info(msg.format("skipping"))
            return item
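The wrapper shown here is the inner function of a pipeline-gating decorator; below is a minimal sketch of how the full decorator is usually assembled and applied. The decorator name check_spider_pipeline, the example pipeline class, and the spider-side pipeline attribute are assumptions based on the common pattern, not code from the original project:

import functools
import logging

logger = logging.getLogger(__name__)


def check_spider_pipeline(process_item_method):
    """Run a pipeline step only for spiders that list this pipeline class."""

    @functools.wraps(process_item_method)
    def wrapper(self, item, spider):
        msg = "{{0}} {0} pipeline step".format(self.__class__.__name__)
        if self.__class__ in spider.pipeline:
            logger.info(msg.format("executing"))
            return process_item_method(self, item, spider)
        logger.info(msg.format("skipping"))
        return item

    return wrapper


# Hypothetical usage:
#
# class CsvExportPipeline:
#     @check_spider_pipeline
#     def process_item(self, item, spider):
#         ...
#
# class MySpider(scrapy.Spider):
#     pipeline = {CsvExportPipeline}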
Code Example #23
File: pipelines.py  Project: qwertyI/daily_news
    def wrapper(self, item, spider):

        # message template for debugging; the literal {0} placeholder is
        # filled in below with "executing" or "skipping"
        msg = "{{0}} {0} pipeline step".format(self.__class__.__name__)

        # if this class is in the spider's pipeline, then use the
        # process_item method normally.
        if self.__class__ in spider.pipeline:
            logger.info(msg.format("executing"))
            return process_item_method(self, item, spider)

        # otherwise, just return the untouched item (skip this step in
        # the pipeline)
        else:
            logger.info(msg.format("skipping"))
            return item
Code Example #24
File: middlewares.py  Project: ICDI0906/scrapy_test
    def process_request(self, request, spider):
        try:
            driver = webdriver.PhantomJS(
                service_args=['--disk-cache=true',
                              '--load-images=false'])  # enable disk cache and disable image loading
            wait = WebDriverWait(driver, 1)  # wait up to 1 second for the page to load
            driver.set_window_size(100, 1200)
            if request.url.startswith("http://www.sohu.com"):
                other = request.url.split('depth')
                depth = int(other[1].split('=')[1])
                url = other[0]
                if depth == 1 or depth == 2:
                    try:
                        time.sleep(1)
                        driver.get(url)
                    except:
                        logger.info('出现异常1,2')
                    # (commented-out alternative: speed things up with Chrome and image loading disabled)
                    # chrome_opt = webdriver.ChromeOptions()
                    # prefs = {"profile.managed_default_content_settings.images": 2}
                    # chrome_opt.add_experimental_option("prefs", prefs)
                    # #driver = webdriver.Chrome(service_args=['--disk-cache=true','--load-images=false'])
                    # driver = webdriver.Chrome( chrome_options=chrome_opt)
                    # driver.set_window_size(100, 1000)
                    # driver.get(request.url)
                    # js = "document.documentElement.scrollTop=document.documentElement.scrollHeight"  # 滚动条下拉1000px
                    # end = driver.find_elements_by_xpath("//div[@class='more-load' and @style='']")
                    # count = 1
                    # while not len(end):
                    #     print(count)
                    #     driver.execute_script(js)
                    #     time.sleep(1)
                    #     end = driver.find_elements_by_xpath("//div[@class='more-load' and @style='']")
                    #     count += 1
                    count = 0
                    while count < 50:
                        driver.execute_script(
                            "window.scrollTo(0, document.body.scrollHeight)")
                        count += 1
                        logger.info('正在加载')
                        time.sleep(0.5)
                    content = driver.page_source.encode('utf-8')
                    driver.close()
                    return HtmlResponse(request.url, body=content)
                else:
                    try:
                        time.sleep(1)
                        driver.get(url)
                    except:
                        logger.info('出现异常3')
                    content = driver.page_source.encode('utf-8')
                    driver.close()
                    return HtmlResponse(request.url, body=content)

        except:
            logger.info('出现异常')
Code Example #25
 def parse(self, response):
     t1 = time.time()
     html = scrapy.Selector(text=response.text)
     divs = html.css("#content_left  > div .f13 .c-tools::attr(data-tools)")
     for div in divs:
         data_str = div.extract()
         data_dict = json.loads(data_str)
         url = None
         try:
             url = requests.get(data_dict['url'], timeout=5).url
             host = urllib.parse.urlparse(url).netloc
             sql = f"insert into seed(url,title,site_name,type) values('{url}','{data_dict['title']}','{host}',1)"
             self.mysql.excute_sql(sql)
         except Exception as e:
             logger.error(
                 f"requests.get(data_dict['url']).url ===>>> {str(e)}")
     t2 = time.time()
     logger.info(f"执行===>>> {response.url} 花费时间{str(t2-t1)}")
Code Example #26
File: honor.py  Project: zm786955593/crawers
 def openlink(self, url):
     """
     urlopen error 10060错误
     :param url:  请求的网址
     :param headers: 报文头部信息
     :return: 服务器响应
     """
     maxTryNum = 15
     for tries in range(maxTryNum):
         try:
             logging.info("请求%s" % (url))
             req = requests.get(url, timeout=13, headers=self.headers)
             logging.info('请求成功%s' % url)
             return req
         except:
             if tries < (maxTryNum - 1):
                 continue
             else:
                 logger.info("尝试%d 次连接网址%s失败!" % (maxTryNum, url))
Code Example #27
 def parse(self, response):
     t1 = time.time()
     html = scrapy.Selector(text=response.text)
     divs = html.css("div.results > div")
     for div in divs:
         vrwrap = div.css("div.vrwrap")
         if len(vrwrap) == 0:
             title = "".join(div.css("div.rb h3 a::text").extract())
             url = "https://www.sogou.com" + div.css("div.rb h3 a::attr(href)").extract()[0]
         else:
             title = "".join(div.css("div.vrwrap h3 a::text").extract())
             url = "https://www.sogou.com" + div.css("div.vrwrap h3 a::attr(href)").extract()[0]
         try:
             # Follow the Sogou redirect page to resolve the real target URL.
             _html = scrapy.Selector(text=requests.get(url, verify=False).text)
             url = _html.re(r'window.location.replace\("(.*?)"\)')[0]
             host = urllib.parse.urlparse(url).netloc
             sql = f"insert into seed(url,title,site_name,type) values('{url}','{title}','{host}',1)"
             self.mysql.excute_sql(sql)
         except Exception as e:
             logger.error(f"requests.get(url).url ===>>> {str(e)}")
     t2 = time.time()
     logger.info(f"执行===>>> {response.url} 花费时间{str(t2-t1)}")
Code Example #28
File: mainlv1.py  Project: tienduynguyen318/jetcom
    def parse(self, response):
        urls = response.css("#__next > div")[1].css(
            "div.eWvQyF > div > div > div >a::attr(href)").extract()
        if len(urls) == 0:
            logger.info("Not found urls in menu bar")
            logger.info("Try to fetch on top categories")
            urls = response.css(
                "#__next  div.eIYDfA > div > div > a::attr(href)").extract()
            if len(urls) == 0:
                urls.append(response.url)

        with open("level2.txt", "a") as f:
            for url in urls:
                f.write(url + "\n")

        logger.info("Write %s urls to file", len(urls))
Code Example #29
    def parse_item(self,response):
        from spider_service.items import TBItem
        from spider_service.common.tools import LevelTool
        selector = scrapy.Selector(response)
        data_obj = TBItem()
        item_rsp_text = response.body_as_unicode()
        if response.status  in [302,301]:
            redirect_url = response.url
            if response.headers.get('Location') and 'login.taobao.com/jump?target=' in response.headers.get('Location') and 'login.taobao.com/jump' not in response.url:
                redirect_url = response.headers.get('Location')
            item_rsp = RequestHandle.get_rsp(redirect_url)
            item_rsp_text = item_rsp.text
            selector = scrapy.Selector(item_rsp)

        if 'detail.tmall.com/item.htm' in response.url or 'tmall.com' in response.url or True:
            title = selector.xpath('//h3[@class="tb-main-title"]/@data-title').extract_first()
            if not title:
                title = selector.xpath('//input[@name="title"]/@value').extract_first()
            if not title:
                logger.info('no find title ,response url:' + response.url)
                return
            # Original price; fetching the promotional price often triggers extra verification
            price = response.xpath("//em[@class='tb-rmb-num']/text()").extract_first()
            nick = selector.xpath('//div[@class="tb-shop-info-wrap"]/div/div[@class="tb-shop-seller"]//a[@class="tb-seller-name"]/text()').extract_first()
            nick_tmc = selector.xpath('//a[@class="slogo-shopname"]/strong/text()').extract_first()
            nick = nick_tmc if not nick and nick_tmc else nick
            shop_type ='B' if nick_tmc else 'C'
            if nick:
                nick = nick.replace('\n','').strip()
            else:
                # For enterprise shops, the shop-level info has to be fetched separately
                nick_list = re.findall('''[^\w]*sellerNick\s*:\s*['"](.+)['"]''',response.body)
                if not nick_list:
                    return
                nick = urllib.unquote_plus(nick_list[0]).decode('gbk') if nick_list else nick
            # e.g. for a "3-crown" seller we can get the count (3) but not whether it is crowns or diamonds, because the badge image lives in the CSS background property
            level_type = selector.xpath('//div[@class="tb-shop-info-hd"]/div[2]/@class').extract_first()
            level_num = len(selector.xpath('//div[@class="tb-shop-info-hd"]/div[2]//i'))
            rate_url = selector.xpath('//div[@class="tb-shop-info-hd"]/div[2]//a/@href').extract_first()
            ##price_url like https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=559466494297&sellerId=96216586&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,originalPrice,tradeContract
            price_url_list = re.findall('''[^\w]*wholeSibUrl\s*:\s*['"](.+)['"]''',response.body)
            count_url_list = re.findall('''[^\w]*counterApi\s*:\s*['"](.+)['"]''',response.body)
            # Lazy-loaded for enterprise shops
            shop_info_url_list = re.findall('''[^\w]*api\s*:\s*['"](.+alicdn\.com/asyn\.htm.*)['"]''',response.body)
            user_id = selector.xpath('//div[@id="J_Pine"]/@data-sellerid').extract_first()
            # B-shop (Tmall) vs C-shop (individual Taobao seller)
            if shop_type == 'B':
                num_iid = selector.xpath('//div[@id="LineZing"]/@itemid').extract_first()
                sid = selector.xpath('//div[@id="LineZing"]/@shopid').extract_first()
                cid_str = selector.xpath('//div[@id="J_ZebraPriceDesc"]/@mdv-cfg').extract_first()
                m = re.findall('catId:(\d+)',cid_str)
                if not m:
                    return
                cid = m[0] if m else ''
                js_data = re.findall('<\s*script[^>]*>[^<]*TShop.Setup\(([^<]*?)\)[^<]*<\s*/\s*script\s*',item_rsp_text)
                js_data = json.loads(js_data[0]) if js_data else {}
                user_id = js_data['itemDO']['userId']
                cid = js_data['itemDO']['categoryId']
                shop_info_url_list = [js_data['api']['fetchDcUrl']]
                price = js_data['detail']['defaultItemPrice'] 
                count_url_list = [js_data['apiBeans']]

            else:
                num_iid = selector.xpath('//div[@id="J_Pine"]/@data-itemid').extract_first()
                sid = selector.xpath('//div[@id="J_Pine"]/@data-shopid').extract_first()
                cid = selector.xpath('//div[@id="J_Pine"]/@data-catid').extract_first()
            detail_common_url = 'https://rate.taobao.com/detailCommon.htm?auctionNumId=%s&userNumId=%s' %(num_iid,user_id) + '&ua=098%23E1hvopvUvbpvUpCkvvvvvjiPPLFpsjDCPFSwsjthPmPh6j3CP2ShljnCPLShlj3UR4wCvvpvvUmmmphvLCCwXQvjOezOafmAdcOdYExrt8g7EcqyaNoxdB%2BaWXxrzjZcR2xVI4mxfXAK4Z7xfa3l5dUf85xr1jZ7%2B3%2BuaNLXSfpAOHmQD7zydiTtvpvIvvvvvhCvvvvvvUnvphvUivvv96CvpC29vvm2phCvhhvvvUnUphvp98yCvv9vvUvQ0%2FCUhOyCvvOWvvVvaZUCvpvVvmvvvhCv2QhvCPMMvvvtvpvhvvvvvv%3D%3D&callback=json_tbc_rate_summary'
            detail_common_rsp = RequestHandle.get_json_rsp(detail_common_url)
            detail_common_dict = ResponseTool.unpack_jsonp(detail_common_rsp.text)
            if price_url_list:
                price_url = 'https:'+ price_url_list[0]
                #yield scrapy.Request(url=price_url,meta = {'data':doc,'cookiejar':1},callback = self.parse_price)
            count_dict = {}
            if count_url_list:
                count_url= 'https:'+count_url_list[0] + '&callback=jsonp109'
                count_rsp = RequestHandle.get_json_rsp(count_url)
                try:
                    count_dict = ResponseTool.unpack_jsonp(count_rsp.text)
                except Exception,e:
                    m_count = re.findall('ICCP_1_%s":(\d*)' % num_iid, count_rsp.text)
                    if m_count:
                        count_dict = {'ICCP_1_%s' % num_iid:int(m_count[0])}
            # Enterprise shop: shop info is loaded asynchronously
            prom_price = None
            if 'tb-shop-info-wrap' not in response.body and title and nick and shop_info_url_list:
                shop_info_url = 'https:' + shop_info_url_list[0]
                shop_text = RequestHandle.get_rsp(shop_info_url).text.replace('\\r\\n','').replace('\\"','"').replace("\\'","'")
                m = re.findall('(<div class="tb-shop".*)',shop_text)
                if m:
                    page = lhtml.document_fromstring(m[0])
                #if shop_type == 'B':
                #    page = lhtml.document_fromstring(shop_text)
                #    prom_price_list = page.xpath('//p[@class="price"]/span/text()') 
                #    prom_price = prom_price_list[0] if prom_price_list else None
                if m:
                    level_num = len(page.xpath('//div[@class="shop-rank-wrap"]/span/a/i'))
                    level_type_obj = page.xpath('//div[@class="shop-rank-wrap"]/span/a') 
                    level_type = level_type_obj[0].get('class') if level_type_obj else level_type 
                page.xpath('//p[@class="price"]/span/text()')
            level = LevelTool.get_level(level_num,level_type)
            #print '===item==',nick,level_num,title,rate_url,price_url_list
            doc = {'nick':nick,'level':level,'item_title':title,'origin_price':price}
            if response.meta.get('data'):
                doc.update(response.meta.get('data'))
            doc.update({'user_id':int(user_id),'num_iid':int(num_iid),'sid':int(sid),'cid':int(cid),'common_detail':detail_common_dict,'count_detail':count_dict})
            data_obj.update(doc)
            yield data_obj
                
Code Example #30
 def process_request(self, request, spider):
     if self.user_agent:
         user_agent = self.get_random_ua()
         logger.info(user_agent)
         request.headers.setdefault(b'User-Agent', user_agent)
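The get_random_ua() helper is not shown in this snippet; a minimal sketch of how it could be implemented on top of the user_agent_list loaded in Code Example #5 follows. The method body below is an assumption:

import random


def get_random_ua(self):
    # Hypothetical helper: pick one of the user agents loaded in __init__.
    return random.choice(self.user_agent_list)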
Code Example #31
    def parse(self, response):
        item = ArticleItem()
        sel = scrapy.Selector(response)

        catalog = ""
        title = join(sel.xpath("//h1[@class='art_tit']/text()").extract())
        source = "vgtime"
        art_url = ""  # 物理存放地址
        auther = join(
            sel.xpath("//div[@class='editor_name']/span[1]/text()").extract())
        create_time = time.altzone
        artTime = join(sel.xpath("//span[@class='time_box']/text()").extract())
        artFromUrl = response.url
        artImageUrl = ""  # 来list同级下的展示图片下载地址
        allhtml = response.body

        # content summary flags (what kinds of content the article contains)
        isString = False
        isImage = False
        isVideo = False

        # Parse the actual content into a list
        conList = []
        com = sel.xpath("//div[@class='topicContent front_content']/*")
        '''capture the content nodes in document order'''
        for index, content in enumerate(com):  # walk every content node
            # logger.info("spider!!!=" + content.extract() + index.__str__())

            pSel = "//div[@class='topicContent front_content']/p[" + (
                index + 1).__str__() + "]"
            imgMvSel = "//div[@class='topicContent front_content']/div[" + (
                index +
                1).__str__() + "]/figure"  # 图片和视频,根据figure后的标签判断是图片还是视频
            pCon = sel.xpath(pSel)
            ivCon = sel.xpath(imgMvSel)
            # logger.info("spider!!!=" + ivCon.extract().__str__() + index.__str__())

            if len(pCon.extract()) > 0:  # text paragraph
                pText = pCon.xpath('string(.)').extract()
                conList.append(join(pText))
                isString = True
                # logger.info("text!!!=" + pText.__str__() + index.__str__())

            if len(ivCon.extract()) > 0:  # image or video
                imgCon = ivCon.xpath("img")
                mvCon = ivCon.xpath("embed")
                mvCon_iframe = ivCon.xpath("iframe")
                if len(imgCon) > 0:  # image
                    imgUrl = imgCon.xpath("@src").extract()
                    imgWidth = imgCon.xpath("@style").re("[1-9]\d*")  # px
                    type = Constant.typeImg
                    imgObj = {
                        "src": join(imgUrl),
                        "width": join(imgWidth),
                        "type": str(type)
                    }
                    conList.append(imgObj)
                    isImage = True
                    # logger.info("image_!!!=" + imgObj.__str__())
                    if len(ivCon.xpath("figcation")) > 0:  # 图片下方介绍文字
                        imgDes = ivCon.xpath("figcation/text()").extract()
                        conList.append(join(imgDes))
                        logger.info("image_!!!=" + imgDes)
                elif len(mvCon.extract()) > 0:  # video (flash embed)
                    mvUrl = mvCon.xpath("@src").extract()
                    mvParams = mvCon.xpath("@flashvars").extract()
                    mvUrl.append(mvParams)  # concatenate the url and its params here
                    mvWidth = mvCon.xpath("@width").extract()
                    mvHeight = mvCon.xpath("@height").extract()
                    type = Constant.typeVideo
                    mvObj = {
                        "src": join(mvUrl),
                        "type": str(type),
                        "width": join(mvWidth),
                        "height": join(mvHeight)
                    }
                    conList.append(mvObj)
                    isVideo = True
                    # logger.info("image_!!!=" + mvObj.__str__())
                elif len(mvCon_iframe.extract()) > 0:  # video inside an iframe
                    mvUrl = mvCon_iframe.xpath("@src").extract()
                    mvWidth = "-1"
                    mvHeight = "-1"
                    if len(mvCon_iframe.xpath("@width").extract()):
                        mvWidth = mvCon_iframe.xpath("@width").extract()
                    if len(mvCon_iframe.xpath("@height").extract()) > 0:
                        mvHeight = mvCon_iframe.xpath("@height").extract()
                    type = Constant.typeVideo
                    mvObj = {
                        "src": join(mvUrl),
                        "type": str(type),
                        "width": join(mvWidth),
                        "height": join(mvHeight)
                    }
                    conList.append(mvObj)
                    isVideo = True
                    # logger.info("image_!!!=" + mvObj.__str__())

        # logger.info("spider!!!=" + conList.__str__())
        # logger.info("spider!!!=" + conList.__len__().__str__())

        # Build the item to return
        item['title'] = title
        item['auther'] = auther
        item['source'] = source
        item['catalog'] = catalog
        item['artUrl'] = art_url
        item['createTime'] = create_time
        item['artTime'] = artTime
        item['artFromUrl'] = artFromUrl
        item['artImageUrl'] = artImageUrl
        item['content'] = conList
        # item['xpathTag'] = allhtml
        item['isString'] = str(isString)
        item['isImage'] = str(isImage)
        item['isVideo'] = str(isVideo)

        yield item
Code Example #32
File: middlewares.py  Project: kaiven11/crawl_project
 def process_exception(self, request, exception, spider):
     self.proxy = get_proxy()
     request.meta['proxy'] = "http://" + self.proxy
     logger.info("process_exception eval! %s" % request.meta)
     return request