Example #1
    def parse_content(self, response):
        """
            解析文章内容
        """
        log.info(('Begin parseContent ' + response.url))

        item = MerchantItem()

        item['updated_at'] = int(time.time())
        item['url'] = response.url
        item['area'] = response.url.split('/')[4]
        item['merchant_id'] = response.url.split('/')[-2]

        try:
            item['merchant_name'] = response.xpath(
                '//span[@id="shop_name_val"]/text()')[0].extract()
            item['company_profile'] = response.xpath(
                '//div[@class="i-txt"]/span[@class="s-con"]/text()'
            )[0].extract()

            item['service_area'] = response.xpath(
                '//div[@class="des"]/div[@class="item-des clearfix"]/div[@class="i-txt i-dTxt"]/text()'
            )[0].extract()

            item['merchant_pic'] = response.urljoin(
                response.xpath('//div[@class="pic"]/img/@src')[0].extract())
            yield item
        except Exception as e:
            # log.warn("-----------------------获取到内容:" + response.text + "------------------------------")
            log.warn("spider error %s ( refer: %s )" % (e, response.url))
            log.error(e)
            if configs.USE_PROXY:
                proxy_pool.add_failed_time(response.meta['proxy'].replace(
                    'http://', ''))
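For reference, here is a minimal sketch of the MerchantItem filled in above, assuming it is a plain scrapy.Item with one Field per key used in parse_content and in the pipeline of Example #9; the real project may declare additional fields.

import scrapy


class MerchantItem(scrapy.Item):
    # One field per key assigned by parse_content (illustrative only).
    updated_at = scrapy.Field()
    url = scrapy.Field()
    area = scrapy.Field()
    merchant_id = scrapy.Field()
    merchant_name = scrapy.Field()
    company_profile = scrapy.Field()
    service_area = scrapy.Field()
    merchant_pic = scrapy.Field()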
Example #2
def get_client(host: str, port: int) -> MongoClient:
    """Create a MongoDB client; returns None when the connection attempt fails."""
    try:
        client = MongoClient(host, port, maxPoolSize=MAX_POOL_SIZE)
        log.info("Connected successfully!!!")
        return client
    except errors.ConnectionFailure as e:
        log.error(e)
        return None
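A possible call site for get_client, assuming a local MongoDB on the default port; the database and collection names below are placeholders, not taken from the project:

client = get_client("localhost", 27017)
if client is not None:
    merchants = client["spider"]["merchant"]  # placeholder database/collection names
    print(merchants.estimated_document_count())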
Example #3
 def process_request(self, request, spider):
     try:
         # Choose one proxy and reuse it for both the log line and the
         # request, so the logged proxy is the one actually used.
         proxy = proxy_pool.random_choice_proxy()
         log.info("==== proxy = " + proxy + "  ====")
         request.meta['proxy'] = "http://%s" % proxy
     except Exception as e:
         log.error(e)
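The proxy_pool object itself is not shown in these examples. A minimal sketch of something exposing the two methods used here (random_choice_proxy and add_failed_time), assuming proxies are plain "host:port" strings and that a proxy is dropped after repeated failures:

import random


class ProxyPool:
    # Illustrative only; the real pool likely loads proxies from configuration
    # or a database and applies a more careful retirement policy.
    def __init__(self, proxies, max_failures=3):
        self.proxies = list(proxies)
        self.failures = {}
        self.max_failures = max_failures

    def random_choice_proxy(self):
        return random.choice(self.proxies)

    def add_failed_time(self, proxy):
        self.failures[proxy] = self.failures.get(proxy, 0) + 1
        if self.failures[proxy] >= self.max_failures and proxy in self.proxies:
            self.proxies.remove(proxy)


proxy_pool = ProxyPool(["127.0.0.1:8888"])  # placeholder proxy list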
Example #4
 def parse_list(self, response):
     selector = Selector(response)
     items_selector = selector.xpath('//div[@id="listITme"]//div[@class="gl-listItem"]')
     for item_selector in items_selector:
         strategy_id = item_selector.xpath('a/@href').extract()[0].replace('/strategy/', '')
         # http://guju.com.cn/strategy/strategy_getStrategyInfo_ajax?strategyModel.id=4498
         next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/strategy/strategy_getStrategyInfo_ajax?strategyModel.id={id}').format(
             id=strategy_id)
         if self.design_strategy_service.is_duplicate_url(next_url):
             log.info("================= filtered " + next_url + " ===========")
             continue
         yield scrapy.Request(next_url, self.parse_content, meta={'id': strategy_id})
Example #5
    def handle_item(self, design_topic_item: DesignTopicItem):
        if self.is_duplicate_url(design_topic_item['html_url']):
            return
        design_topic_model = self.get_model(design_topic_item)
        self.save_to_database(self.collection, design_topic_model)
        self.insert_to_redis(design_topic_model.html_url)

        log.info(
            "========================================================================================="
        )
        log.info("html_url:" + design_topic_item['html_url'])
        log.info("title:" + design_topic_item['title'])
        log.info("description:" + design_topic_item['description'])
        log.info(
            "========================================================================================="
        )
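is_duplicate_url and insert_to_redis are helpers from the pipeline's base class and are not shown here. One plausible implementation keeps the crawled URLs in a Redis set; the key name and connection settings below are placeholders, and the helpers are written as plain functions for brevity:

import redis

_redis = redis.StrictRedis(host="localhost", port=6379, db=0)  # placeholder connection
_SEEN_KEY = "design_topic:seen_urls"                           # placeholder key name


def is_duplicate_url(url):
    # SISMEMBER is O(1), so the check stays cheap as the set grows.
    return _redis.sismember(_SEEN_KEY, url)


def insert_to_redis(url):
    _redis.sadd(_SEEN_KEY, url)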
Example #6
 def process_request(self, request, spider):
     try:
         log.info('Chrome driver begin...')
         self.driver.get(request.url)  # load the page in the browser
         return HtmlResponse(url=request.url,
                             body=self.driver.page_source,
                             request=request,
                             encoding='utf-8',
                             status=200)  # return the rendered HTML
     except TimeoutException:
         return HtmlResponse(url=request.url,
                             request=request,
                             encoding='utf-8',
                             status=500)
     finally:
         log.info('Chrome driver end...')
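self.driver is created elsewhere in this middleware. A minimal setup for a headless Chrome driver with a page-load timeout, using the standard Selenium API; the timeout value and extra options are assumptions:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")    # run without a visible browser window
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(30)      # slow pages raise TimeoutException, handled above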
Example #7
    def handle_item(self, item: CityItem):
        log.info('process item from worm url = ' + item['url'])

        if isinstance(item, CityItem):

            session = self.session()

            model = CityDO()
            model.name = item['name']
            model.url = item['url']
            model.create_at = item['create_at']

            try:
                m = session.query(CityDO).filter(
                    CityDO.url == model.url).first()

                if m is None:  # insert a new row
                    log.info('add model from worm url = ' + model.url)
                    session.add(model)
                    session.flush()
                    session.commit()
                    log.info('spider_success url = ' + model.url)

            except Exception as error:
                session.rollback()
                log.error(error)
                raise
            finally:
                session.close()
        return item
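CityDO is the SQLAlchemy model this pipeline writes to. A minimal declarative sketch consistent with the fields assigned above; the table name and column types are assumptions:

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class CityDO(Base):
    __tablename__ = "city"  # placeholder table name

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(64))
    url = Column(String(255), index=True)  # looked up by url before inserting
    create_at = Column(Integer)            # unix timestamp set by the spider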
Example #8
    def parse(self, response):
        """
        解析文章列表页,拿到页面上的链接,给内容解析页使用,如果有下一页,则调用本身 parse()
        """
        log.info('Begin parse ' + response.url)

        merchants = response.xpath(
            '//div[@class="company-item"]//div[@class="ordinary clearfix"]')

        for index, merchant in enumerate(merchants):
            item = MerchantListItem()
            item['list_url'] = response.urljoin(
                merchant.xpath('./div[@class="list-middle fl"]/h2/a/@href').
                extract_first())
            item['category_name'] = merchant.xpath(
                './div[@class="list-middle fl"]/h2/a/text()').extract_first()
            item['merchant_id'] = merchant.xpath(
                './div[@class="list-right fl"]/a/@shop_id').extract_first()

            log.info('a href = ' + item['list_url'])
            yield Request(url=item['list_url'], callback=self.parse_content)

        # Is there a next page? If so, continue crawling.
        pages = response.xpath('//div[@class="p_page"]/a')
        cur_page_xpath = response.xpath(
            '//div[@class="p_page"]/span[@class="cur"]/text()')
        # xpath() returns a (possibly empty) SelectorList, never None,
        # so test for emptiness instead.
        if not cur_page_xpath:
            cur_page_num = 1
        else:
            cur_page_num = int(cur_page_xpath.extract_first())
        if pages:
            next_index = None
            for index, page_list in enumerate(pages):
                page_num_str = page_list.xpath('./text()').extract_first()
                if self.is_number(page_num_str):
                    page_num = int(page_num_str)
                    if page_num > cur_page_num:
                        next_index = index
                        break

            # Only follow a link when a page number larger than the current
            # one was actually found; otherwise this page is the last one.
            if next_index is not None:
                next_page_url = response.urljoin(
                    pages[next_index].xpath('./@href').extract_first())
                log.info('next_page_url: ' + next_page_url)
                # Feed the "next page" link back into this method and parse again
                yield scrapy.Request(next_page_url, callback=self.parse)
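The is_number helper used when scanning the pager is not shown in these examples. A straightforward version that accepts the digit strings pulled from the page links (placed here only for illustration):

    @staticmethod
    def is_number(s):
        # Pager links can also carry text such as "next page", which should be
        # skipped rather than raise an exception.
        if s is None:
            return False
        try:
            int(s)
            return True
        except ValueError:
            return False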
Example #9
    def handle_item(self, item: MerchantItem):
        log.info('process item from worm url = ' + item['url'])

        if isinstance(item, MerchantItem):

            session = self.session()

            model = MerchantDO()
            model.updated_at = item['updated_at']
            model.merchant_name = item['merchant_name']
            model.company_profile = item['company_profile']
            model.service_area = item['service_area']
            model.merchant_pic = item['merchant_pic']
            model.merchant_id = item['merchant_id']
            model.url = item['url']
            model.area = item['area']

            try:
                m = session.query(MerchantDO).filter(
                    MerchantDO.url == model.url).first()

                if m is None:  # insert a new row
                    log.info('add model from worm url = ' + model.url)
                    session.add(model)
                    session.flush()
                    session.commit()
                    log.info('spider_success url = ' + model.url)

                # else:  # update the existing row
                #     log.info("update model from gp url " + model.url)
                #     m.updated_at = item['updated_at']
                #     m.merchant_name = item['merchant_name']
                #     m.merchant_pic = item['merchant_pic']
                #     m.service_area = item['service_area']
                #     m.merchant_id = item['merchant_id']
                #     m.company_profile = item['company_profile']
                #     m.area = item['area']
                #     m.url = item['url']

            except Exception as error:
                session.rollback()
                log.error(error)
                raise
            finally:
                session.close()
        return item
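In this pipeline and in Example #7, self.session behaves like a SQLAlchemy session factory (it is called to obtain a new session). A typical setup looks like the sketch below; the SQLite URL is only a placeholder for whatever database the project actually uses:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine("sqlite:///spider.db")  # placeholder connection URL
Session = sessionmaker(bind=engine)

# In the pipeline's __init__ (sketch):
#     self.session = Session   # handle_item then calls self.session() per item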
Example #10
    def parse(self, response):
        """
        解析内容
        """
        log.info('Begin parse ' + response.url)

        cities = response.xpath(
            '//div[@class="city_main"]/dl[@class="clearfix"]/dd/a')

        for index, city in enumerate(cities):
            item = CityItem()
            item['url'] = response.urljoin(
                city.xpath('./@href').extract_first())
            item['name'] = city.xpath('./text()').extract_first()
            item['create_at'] = int(time.time())
            log.info('a href = ' + item['url'])
            yield item

        log.info('End parse ' + response.url)
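A minimal CityItem matching the three keys filled in above, assuming it is a plain scrapy.Item:

import scrapy


class CityItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    create_at = scrapy.Field()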
Example #11
    def handle_item(self, design_picture_item: DesignPictureItem):
        if self.is_duplicate_url(design_picture_item['img_url']):
            return
        design_picture_model = self.get_design_picture_model(
            design_picture_item)
        self.save_to_database(self.collection, design_picture_model)

        summary_model = self.find_one(self.summary_collection,
                                      {'id': design_picture_model.fid})
        if summary_model is None:
            summary_model = self.create_design_picture_summary_model(
                design_picture_model)
            self.save_to_database(self.summary_collection, summary_model)
        else:
            tags = list(
                set(summary_model['tags']).union(set(
                    design_picture_model.tags)))
            summary_model['cid'].append(design_picture_model.id)
            self.update_one(self.summary_collection,
                            {'id': summary_model['id']}, {
                                'update_time': utils.get_utc_time(),
                                'tags': tags,
                                'cid': summary_model['cid']
                            })
        self.insert_to_redis(design_picture_model.img_url)

        log.info(
            "========================================================================================="
        )
        log.info("title:" + design_picture_item['title'])
        log.info("sub_title:" + design_picture_item['sub_title'])
        log.info("original_width:" + design_picture_item['img_width'])
        log.info("original_height:" + design_picture_item['img_height'])
        log.info("html_url:" + design_picture_item['html_url'])
        log.info("img_url:" + design_picture_item['img_url'])
        log.info("description:" + design_picture_item['description'])
        log.info("tags:%s" % ','.join(map(str, design_picture_item['tags'])))
        log.info(
            "========================================================================================="
        )
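save_to_database, find_one and update_one are thin wrappers in the pipeline's base class. One plausible mapping onto pymongo calls, written as plain functions for brevity; how the model object is converted to a document is an assumption:

def save_to_database(collection, model):
    # Assumes the model exposes its fields as instance attributes.
    collection.insert_one(dict(model.__dict__))


def find_one(collection, query):
    return collection.find_one(query)


def update_one(collection, query, fields):
    # Overwrite only the given fields, leaving the rest of the document intact.
    collection.update_one(query, {"$set": fields})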