Example #1
    def _next_request(self, spider):
        try:
            slot = self.slots[spider]
        except KeyError:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as exc:
                log.err(None, 'Obtaining request from start requests', \
                        spider=spider)
            else:
                if request:
                    self.crawl(request, spider)
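A note on the `log.err(None, ...)` idiom above: with Twisted-style logging, `err(_stuff, _why, **kw)` treats a first argument of `None` as "capture the exception currently being handled", so calling it inside an `except` block records the active traceback and uses `_why` as the description. A minimal sketch:

    try:
        1 / 0
    except ZeroDivisionError:
        log.err(None, 'Obtaining request from start requests')  # logs the ZeroDivisionError traceback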
Example #2
    def __init__(self, fresh=False, nosentry=False, *args, **kwargs):
        super(TorrentsSpider, self).__init__(*args, **kwargs)

        try:
            cfg = ConfigParser()
            cfg.readfp(open('scrapy.properties'))
            
            
            self.db = pymongo.MongoClient(cfg.get('mongo', 'host'),
                                          int(cfg.get('mongo', 'port'))).torrents
            self.db_sources = pymongo.MongoClient(
                cfg.get('mongo', 'host_sources'),
                int(cfg.get('mongo', 'port_sources'))).foofind

            self.load_srcs_tables()

            self.sentry = raven.Client(cfg.get('sentry', 'dsn')) \
                if not nosentry and 'sentry' in cfg.sections() else None
        except IOError as e:  # file not found
            log.err('Configuration file not found: scrapy.properties')
            raise
        except NoSectionError as e:  # section missing
            log.err(str(e))
            raise
    
        self.fresh = fresh  # are we only picking fresh links?
    
        self.first_unknown_site = True  # used to warn at the 1st unknown site
Example #3
 def parseMobileProductDetail(self, response):
     err("555555555555555555555555")
     head = response.meta
     product_size_info = ProductSizeItem()
     product_sale_value = ProductSaleValueItem()
     size_json = re.search(
         "priceInfo\":(.*?),\"resultCode\":0,\"wanrentuanInfo",
         response.body)
     if size_json:
         size_json = size_json.group(1)
         size_json = size_json.decode("gbk")
         size_dict = json.loads(size_json)
         for skuid in size_dict:
             sku = size_dict[skuid]
             price = sku['price']['amount']
             print "skuid[" + str(skuid) + "] price[" + str(price) + "]"
             product_size_info['productId'] = response.meta['productId']
             product_size_info['taskId'] = response.meta['taskId']
             product_size_info['skuId'] = skuid
             product_size_info['promot_price'] = price
             yield product_size_info
     sale_num = re.search("sellCount\":([0-9]+)", response.body)
     value_num = re.search("rateCounts\":([0-9]+)", response.body)
     if value_num:
         product_sale_value["value_num"] = int(value_num.group(1))
     else:
         product_sale_value["value_num"] = -1
     if sale_num:
         product_sale_value["sale_num"] = int(sale_num.group(1))
     else:
         product_sale_value["sale_num"] = -1
     product_sale_value["productId"] = response.meta["productId"]
     product_sale_value["taskId"] = response.meta["taskId"]
     yield product_sale_value
Example #4
    def spider_closed(self, spider, reason):
        if hasattr(spider, 'debug') and spider.debug:
            log.msg(utils.Y(u'disable logger'), level=log.WARNING)
            return

        if hasattr(spider, 'logger'):
            try:
                from pymongo import uri_parser, MongoClient
                uri = spider.logger
                if not uri:
                    return

                log.msg('post bot stats to <{}>'.format(uri))
                cnn, db, tbl = utils.connect_uri(uri)

                ago = self.stats.get_value('start_time', datetime.utcnow())
                now = datetime.utcnow()

                self.stats.set_value('finish_time', now, spider=spider)
                self.stats.set_value('elapsed_time', (now-ago).total_seconds(), spider=spider)
                self.stats.set_value('finish_reason', reason, spider=spider)
                self.stats.set_value('bot_ip', utils.get_ipaddr('eth0'))
                self.stats.set_value('bot_name', self.crawler.settings.get('BOT_NAME', 'unknown'))
                self.stats.set_value('spider_name', spider.name)
                self.stats.set_value('config_path', spider.config)
                self.stats.set_value('job_id', os.getenv('SCRAPY_JOB', None))

                tbl.insert({k.replace('.', '_'):v for k,v in self.stats.get_stats().iteritems()})
                cnn.close()
            except Exception as ex:
                log.err('cannot post bot stats: {}'.format(ex))
Example #5
 def start_requests(self):
     err("00000000000000000000000000")
     while True:
         conn = httplib.HTTPConnection("182.92.67.121", 8888)
         task_url = "/gettask?spider_name=" + self.spider_name + "&spider_type=" + self.name
         conn.request('GET', task_url)
         response_result = conn.getresponse().read()
         conn.close()
         if response_result.find("taskId") == -1:
             continue
         if response_result.find("productId") == -1:
             continue
         if response_result.find("keyword") == -1:
             continue
         response_dic = json.loads(response_result)
         pc_start_url = self.tmall_fp_url_prefix + response_dic[0][
             "keyword"] + self.tmall_fp_url_suffix
         head = {}
         seller_shop_name = response_dic[0]["keyword"]
         head["seller_shop_name"] = self.get_seller_shop_name(
             seller_shop_name)
         head["taskId"] = response_dic[0]["taskId"]
         head["list_url"] = pc_start_url
         print pc_start_url
         yield Request(pc_start_url,
                       callback=self.parse,
                       meta=head,
                       dont_filter=True)
Example #6
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """Like pydispatcher.robust.sendRobust but it also logs errors and returns
    Failures instead of exceptions.
    """
    dont_log = named.pop('dont_log', None)
    spider = named.get('spider', None)
    responses = []
    for receiver in liveReceivers(getAllReceivers(sender, signal)):
        try:
            response = robustApply(receiver,
                                   signal=signal,
                                   sender=sender,
                                   *arguments,
                                   **named)
            if isinstance(response, Deferred):
                log.msg("Cannot return deferreds from signal handler: %s" % \
                    receiver, log.ERROR, spider=spider)
        except dont_log:
            result = Failure()
        except Exception:
            result = Failure()
            log.err(result, "Error caught on signal handler: %s" % receiver, \
                spider=spider)
        else:
            result = response
        responses.append((receiver, result))
    return responses
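A minimal usage sketch for `send_catch_log`, assuming pydispatcher is installed; the `item_scraped` signal token and `handler` receiver below are hypothetical names invented for illustration:

    from pydispatch import dispatcher

    item_scraped = object()  # hypothetical signal token

    def handler(item, **kwargs):
        # receivers also get signal/sender along with the keyword args passed below
        return len(item)

    dispatcher.connect(handler, signal=item_scraped)

    # Exceptions in receivers come back as twisted.python.failure.Failure
    # objects in the (receiver, result) pairs instead of propagating.
    for receiver, result in send_catch_log(signal=item_scraped, item={'a': 1}):
        print receiver, result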
Example #7
    def __init__(self, fresh=False, nosentry=False, *args, **kwargs):
        super(TorrentsSpider, self).__init__(*args, **kwargs)

        try:
            cfg = ConfigParser()
            cfg.readfp(open('scrapy.properties'))

            self.db = pymongo.MongoClient(cfg.get('mongo', 'host'),
                                          int(cfg.get('mongo',
                                                      'port'))).torrents
            self.db_sources = pymongo.MongoClient(
                cfg.get('mongo', 'host_sources'),
                int(cfg.get('mongo', 'port_sources'))).foofind

            self.load_srcs_tables()

            self.sentry = raven.Client(cfg.get('sentry', 'dsn')) \
                if not nosentry and 'sentry' in cfg.sections() else None
        except IOError as e:  # file not found
            log.err('Configuration file not found: scrapy.properties')
            raise
        except NoSectionError as e:  # section missing
            log.err(str(e))
            raise

        self.fresh = fresh  # are we only picking fresh links?

        self.first_unknown_site = True  # used to warn at the 1st unknown site
Example #8
 def parse_product_list(self, response):
     err("33333333333333333333333")
     head = response.meta
     hxs = HtmlXPathSelector(response)
     product_detail_url = hxs.select("//dd[@class='thumb']/../dt[@class='photo']/a/@href").extract()
     for letter in product_detail_url:
         yield Request("http:" + letter, callback=self.product, meta=head)
Example #9
 def parseMobileProductDetail(self, response):
     err("555555555555555555555555")
     head = response.meta
     product_size_info = ProductSizeItem()
     product_sale_value = ProductSaleValueItem()
     size_json = re.search("priceInfo\":(.*?),\"resultCode\":0,\"wanrentuanInfo", response.body)
     if size_json:
         size_json = size_json.group(1)
         size_json = size_json.decode("gbk")
         size_dict = json.loads(size_json)
         for skuid in size_dict:
             sku = size_dict[skuid]
             price = sku['price']['amount']
             print "skuid[" + str(skuid) + "] price[" + str(price) + "]"
             product_size_info['productId'] = response.meta['productId']
             product_size_info['taskId'] = response.meta['taskId']
             product_size_info['skuId'] = skuid
             product_size_info['promot_price'] = price
             yield product_size_info
     sale_num = re.search("sellCount\":([0-9]+)", response.body)
     value_num = re.search("rateCounts\":([0-9]+)", response.body)
     if value_num:
         product_sale_value["value_num"] = int(value_num.group(1))
     else:
         product_sale_value["value_num"] = -1
     if sale_num:
         product_sale_value["sale_num"] = int(sale_num.group(1))
     else:
         product_sale_value["sale_num"] = -1
     product_sale_value["productId"] = response.meta["productId"]
     product_sale_value["taskId"] = response.meta["taskId"]
     yield product_sale_value 
Example #10
 def kaka(self, response):
     product = ProductItem()
     hxs = HtmlXPathSelector(response)
     he = response.meta
     # product_id_pattern = "/([0-9]+)/"
     product_object = hxs.select("//div[@class='product-frame']/div[@class='table-holder']/table[@class='table-info']/tbody/tr/td[@class='discount']")
     err("===========================================")
     for index in range(len(product_object)):
         # product_productId = re.search(product_id_pattern, response.url)
         product_productId = product_object[index].select("./../td[@class='first']/div[1]/@id").extract()
         product['productId'] = product_productId[0].replace("radio_", "")
         product['brandId'] = he['code']
         product_name = hxs.select("//div[@class='product-frame']/h2/text()").extract()
         product['name'] = product_name[0]
         product_price = product_object[index].select("./../td[@class='last price']/text()").extract()
         # strip the currency sign and any thousands separators
         product['price'] = product_price[0].replace(u"¥", "").replace(",", "")
         product_standard = product_object[index].select("./../td[@class='first']/div[2]/label/text()").extract()
         product['standard'] = product_standard[0].strip()
         product_introduce = hxs.select("//div[@class='tab-content']/div[@id='tab1_3']/div[" + str(index + 1) + "]/ul[@class='desc-ul']/li/text()").extract()
         product['introduce'] = "\n".join(product_introduce)
         yield product
Example #11
 def handle_spider_error(self, _failure, request, spider, propagated_failure=None):
     referer = request.headers.get('Referer', None)
     msg = "Spider error processing <%s> (referer: <%s>)" % \
         (request.url, referer)
     log.err(_failure, msg, spider=spider)
     stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
         spider=spider)
Example #12
    def spider_closed(self, spider, reason):
        if self.enabled and hasattr(spider, 'logger'):
            try:
                uri = spider.logger
                if not uri:
                    return

                log.msg('post bot stats to <{}>'.format(uri))
                cnn, db, tbl = utils.connect_uri(uri)

                ago = self.stats.get_value('start_time', datetime.utcnow())
                now = datetime.utcnow()

                self.stats.set_value('finish_time', now, spider=spider)
                self.stats.set_value('elapsed_time',
                                     (now - ago).total_seconds(),
                                     spider=spider)
                self.stats.set_value('finish_reason', reason, spider=spider)
                self.stats.set_value('bot_ip', utils.get_ipaddr())
                self.stats.set_value(
                    'bot_name',
                    self.crawler.settings.get('BOT_NAME', 'unknown'))
                self.stats.set_value('spider_name', spider.name)
                self.stats.set_value('config_path', spider.config)
                self.stats.set_value('job_id', os.getenv('SCRAPY_JOB', None))

                tbl.insert({
                    k.replace('.', '_'): v
                    for k, v in self.stats.get_stats().iteritems()
                })
                cnn.close()
            except Exception as ex:
                log.err('cannot post bot stats: {}'.format(ex))
Example #13
 def test_err_noargs(self):
     try:
         a = 1 / 0
     except:
         log.err()
      self.assertIn('Traceback', self.logged())
      self.assertIn('ZeroDivisionError', self.logged())
Example #14
    def media_downloaded(self, response, request, info):
        referer = request.headers.get('Referer')

        if response.status != 200:
            log.msg(format='File (code: %(status)s): Error downloading file from %(request)s referred in <%(referer)s>',
                    level=log.WARNING, spider=info.spider,
                    status=response.status, request=request, referer=referer)
            raise FileException('download-error')

        if not response.body:
            log.msg(format='File (empty-content): Empty file from %(request)s referred in <%(referer)s>: no-content',
                    level=log.WARNING, spider=info.spider,
                    request=request, referer=referer)
            raise FileException('empty-content')

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        log.msg(format='File (%(status)s): Downloaded file from %(request)s referred in <%(referer)s>',
                level=log.DEBUG, spider=info.spider,
                status=status, request=request, referer=referer)
        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info)
            checksum = self.file_downloaded(response, request, info)
        except FileException as exc:
            whyfmt = 'File (error): Error processing file from %(request)s referred in <%(referer)s>: %(errormsg)s'
            log.msg(format=whyfmt, level=log.WARNING, spider=info.spider,
                    request=request, referer=referer, errormsg=str(exc))
            raise
        except Exception as exc:
            whyfmt = 'File (unknown-error): Error processing file from %(request)s referred in <%(referer)s>'
            log.err(None, whyfmt % {'request': request, 'referer': referer}, spider=info.spider)
            raise FileException(str(exc))

        return {'url': request.url, 'path': path, 'checksum': checksum}
Example #15
    def parse(self, response):
        site = Selector(response)
        base_xpath = site.xpath('//div[@id="resultList"]/div[@class="el"]')
        detail_urls = []
        log.msg("current page is {0}".format(response.url), level=log.INFO)

        for var in base_xpath:
            try:
                item = Job51CrawlerItem()
                item['job_id'] = var.xpath('p[1]/input[1]/@value').extract()[0]
                # if is_job_id_exists(item['job_id']):continue

                item['job_name'] = var.xpath('p[1]/span[1]/a[1]/@title').extract()[0]
                item['job_url'] = var.xpath('p[1]/span[1]/a[1]/@href').extract()[0]
                detail_urls.append(item['job_url'])
                item['company_name'] = var.xpath('span[@class="t2"][1]/a[1]/@title').extract()[0]
                item['company_url'] = var.xpath('span[@class="t2"][1]/a[1]/@href').extract()[0]
                try:
                    item['job_address'] = var.xpath('span[@class="t3"][1]/text()').extract()[0]
                except IndexError:
                    item['job_address'] = ''
                try:
                    item['job_salary'] = var.xpath('span[@class="t4"][1]/text()').extract()[0]
                except IndexError:
                    item['job_salary'] = ''
                item['pub_date'] = var.xpath('span[@class="t5"][1]/text()').extract()[0]
                salary_temp = self.salary_unicode2int(item['job_salary'])

                if salary_temp is None:
                    salary_low = 0
                    salary_high = 0
                else:
                    salary_low = salary_temp.get('low')
                    salary_high = salary_temp.get('high')
                item['salary_low'] = salary_low
                item['salary_high'] = salary_high
                yield scrapy.Request(item['job_url'], meta={'item': item}, callback=self.parse_detail)
            except Exception as e:
                log.err(e)
Example #16
    def process_item(self, item, spider):

        if item.__class__ == HouseItem:
            try:
                self.cursor.execute("""select * from house where id = %s""",
                                    (item["id"],))
                ret = self.cursor.fetchone()
                if ret:
                    self.cursor.execute(
                        """update house set h_name = %s,detail_url = %s,community_name = %s,
                            area = %s,pattern = %s,latitude = %s,longitude = %s,remark = %s
                            where id = %s""",
                        (item['h_name'], item['detail_url'],
                         item['community_name'], item['area'], item['pattern'],
                         item['latitude'], item['longitude'], item['remark'],
                         item['id']))
                    self.insert_or_update_house_daily(mode=2)
                else:
                    self.cursor.execute(
                        """insert into house(id,h_name,detail_url,community_name,area,
                          pattern,latitude,longitude, remark)
                          value (%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                        (item['id'], item['h_name'], item['detail_url'],
                         item['community_name'], item['area'], item['pattern'],
                         item['latitude'], item['longitude'], item['remark']))
                    self.insert_or_update_house_daily(mode=3)
                self.connect.commit()
            except Exception as error:
                log.err(error)
                print(error)
        return item
Example #17
    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as exc:
                slot.start_requests = None
                log.err(None, 'Obtaining request from start requests', \
                        spider=spider)
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)
Example #18
    def media_downloaded(self, response, request, info):
        referer = request.headers.get('Referer')

        if response.status != 200:
            log.msg(format='Image (code: %(status)s): Error downloading image from %(request)s referred in <%(referer)s>',
                    level=log.WARNING, spider=info.spider,
                    status=response.status, request=request, referer=referer)
            raise ImageException

        if not response.body:
            log.msg(format='Image (empty-content): Empty image from %(request)s referred in <%(referer)s>: no-content',
                    level=log.WARNING, spider=info.spider,
                    request=request, referer=referer)
            raise ImageException

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        log.msg(format='Image (%(status)s): Downloaded image from %(request)s referred in <%(referer)s>',
                level=log.DEBUG, spider=info.spider,
                status=status, request=request, referer=referer)
        self.inc_stats(info.spider, status)

        try:
            key = self.image_key(request.url)
            checksum = self.image_downloaded(response, request, info)
        except ImageException as ex:
            log.err('image_downloaded hook failed: %s' % ex,
                    level=log.WARNING, spider=info.spider)
            raise
Example #19
    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as exc:
                log.err(None, 'Obtaining request from start requests', \
                        spider=spider)
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)
Example #20
 def _sql(self):
     if not self["table_name"] or not self["table_action"]:
         log.err("SQL: table_name or table_action is None")
         return None
     _sql_str = None
     if self["table_action"] == "insert":
         n_str = ""
         v_str = ""
         u_str = ""
         for key_name in self["table_keys"]:
             if self.get(key_name):
                 n_str += key_name + ","
                 v_str += "'%s'," % str(self.get(key_name)).replace("'", "\\'")
                 # mirror the inserted values when the key already exists (upsert)
                 u_str += "%s=VALUES(%s)," % (key_name, key_name)
         if v_str:
             _sql_str = "INSERT INTO %s (%s) VALUES (%s) ON DUPLICATE KEY UPDATE %s" % (
                 self["table_name"], n_str[:-1], v_str[:-1], u_str[:-1])
     elif (self["table_action"] == "update" and self["table_primray_key"]
           and self.get(self["table_primray_key"])):
         v_str = ""
         for key_name in self["table_keys"].keys():
             if key_name != self["table_primray_key"] and self.get(key_name):
                 v_str += key_name + "='" + str(self.get(key_name)).replace("'", "\\'") + "',"
         if v_str:
             _sql_str = "UPDATE %s SET %s WHERE %s=%s" % (
                 self["table_name"], v_str[:-1], self["table_primray_key"],
                 self.get(self["table_primray_key"]))
     return _sql_str
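For reference, a sketch of what `_sql` produces, treating it as a plain function over a dict-like item; all field values below are hypothetical:

    item = {
        "table_name": "products",
        "table_action": "insert",
        "table_keys": {"pid": None, "name": None},  # only the key names are read here
        "table_primray_key": "pid",  # spelling follows the field name used above
        "pid": 7,
        "name": "O'Reilly",
    }
    # _sql(item) builds, modulo dict key order, roughly:
    # INSERT INTO products (pid,name) VALUES ('7','O\'Reilly')
    #     ON DUPLICATE KEY UPDATE pid=VALUES(pid),name=VALUES(name)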
Example #21
 def test_err_noargs(self):
     try:
         a = 1/0
     except:
         log.err()
      self.assertIn('Traceback', self.logged())
      self.assertIn('ZeroDivisionError', self.logged())
Example #22
 def process_item(self, item, spider):
     if isinstance(item, NovelItem):
         # novelItem = dict(item)
         # self.novel_coll.insert(novelItem)
         try:
             self.cursor.execute(
                 """insert into novel_novel(id_book, book_name, author, category_id, status, image, description,
                                novel_url, update_time) value (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                 (item['id_book'], item['book_name'], item['author'],
                  item['category_id'], item['status'], item['image'],
                  item['description'], item['novel_url'],
                  item['update_time']))
             self.conn.commit()
         except Exception as e:
             err(e)
     elif isinstance(item, ChapterItem):
         # chapterItem = dict(item)
         # self.chapter_coll.insert(chapterItem)
         try:
             self.cursor.execute(
                 """insert into novel_chapter(title, chapter_url, content, book_id, insert_num) value (%s, %s, %s, %s, %s)""",
                 (item['title'], item['chapter_url'], item['content'],
                  item['book_id'], item['insert_num']))
             self.conn.commit()
         except Exception as e:
             print(e)
     return item
Example #23
 def fetch_errback(self, twisted_request, failure):
     msg = (
         "The request to the web-server failed. "
         "The crawler engine returned an error: %s" % failure.getErrorMessage()
     )
     log.err(failure)
     finish_request(twisted_request, error=msg)
Example #24
    def process_item(self, item, spider):
        sql = ''
        params = ()

        if isinstance(item, VesselsItem):
            sql = 'insert into expected_vessels(today, date, ata_eta, vessel, cargo, quantity, ie, agent) values (%s, %s, %s, %s, %s, %s, %s, %s)'
            params = (item['today'], item['date'], item['ata_eta'], item['vessel'], item['cargo'], item['quantity'], item['ie'], item['agent'])

        elif isinstance(item, MovementItem):
            sql = 'insert into shipping_movement(date, state, vessel_name, berth_allotted, pilot_boarding_time) values (%s, %s, %s, %s, %s)'
            params = (item['date'], item['state'], item['vessel_name'], item['berth_allotted'], item['pilot_boarding_time'])

        elif isinstance(item, PositionItem):
            sql = '''insert into vessel_position(date, berth, vessel, ie, fc, date_of_berthing, cargo, 
            quantity, day_s_handling, up_to_date_hanfling, balance, load_or_discharge_port, agent) 
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
            params = (item['date'], item['berth'], item['vessel'], item['ie'], item['fc'], item['date_of_berthing'], item['cargo'], 
                item['quantity'], item['day_s_handling'], item['up_to_date_hanfling'], item['balance'], item['load_or_discharge_port'], item['agent'])

        if sql:
            try:
                self.cursor.execute(sql, params)
                self.connect.commit()
            except Exception as error:
                log.err(error)

        return item
Example #25
 def test_err_noargs(self):
     try:
         a = 1/0
     except:
         log.err()
     self.assertIn('Traceback', self.logged())
     self.assertIn('ZeroDivisionError', self.logged())
Example #26
 def messi(self, response):
     head = response.meta
     for letter in self.kaka():
         index = letter.find('search/')
         head['code'] = letter[index + 7]
         err("================================")
         yield Request(letter, callback=self.cluo, meta=head)
Example #27
 def item_completed(self, results, item, info):
     """Called per item when all media requests has been processed"""
     if self.LOG_FAILED_RESULTS:
         for success, result in results:
             if not success:
                 log.err(result, '%s found errors proessing %s' % (self.__class__.__name__, item))
     return item
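For context, a sketch of the `results` argument that Scrapy's media pipeline passes to `item_completed`: a list of `(success, result)` pairs, where failures arrive as `twisted.python.failure.Failure` objects (the values below are illustrative only):

    from twisted.python.failure import Failure

    results = [
        (True, {'url': 'http://example.com/a.jpg',
                'path': 'full/0123abcd.jpg',
                'checksum': 'deadbeef'}),
        (False, Failure(IOError('download error'))),
    ]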
Example #28
 def cluo(self, response):
     hea = response.meta
     hxs = HtmlXPathSelector(response)
     total = TotalItem()
     for letter in hxs.select("//div[@class='clear']/div[@class='ymppsy06 pleft30']/div[@class='ymppsy04']"):
         total_src = letter.select("./div[@class='ymppsy05']/a/img/@src").extract()
         total['src'] = total_src[0]
         total_china = letter.select("./div[@class='txtCenter lh17 ptop3']/text()[1]").extract()
         total['china'] = total_china[0]
         total_english = letter.select("./div[@class='txtCenter lh17 ptop3']/text()[2]").extract()
         total['english'] = total_english[0]
         total['name'] = total['china'] + ".jpg"
         total_hold = letter.select("./div[@class='lh20 pleft25 ptop5']/span[3]/text()").extract()
         if total_hold:
             total['hold'] = total_hold[0]
         else:
             total['hold'] = "0"
         path = "/dir/image_b/" + total['name']
         urllib.urlretrieve(total['src'], path)
         err("++++++++++++++++++++++++++++++++++++++++")
         yield total
     for lett in hxs.select("//div[@class='ymppfy01 clear txtRright']/a[@class='ymppfy02']/text()").extract():
         if re.search("[1-9]", lett):
             newurl = "http://hzp.rayli.com.cn/brandlist/search/" + hea['code'] + "_3_" + lett + ".html"
             err("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
             if newurl not in self.set_url:
                 self.set_url.append(newurl)
                 yield Request(newurl, callback=self.cluo, meta=hea)
Example #29
 def test_err_noargs(self):
     try:
         a = 1 / 0
     except:
         log.err()
     self.assertIn('Traceback', self.logged())
     self.assertIn('ZeroDivisionError', self.logged())
Example #30
    def item_completed(self, results, item, info):
        if self.LOG_FAILED_RESULTS:
            msg = '%s found errors processing %s' % (self.__class__.__name__,
                                                     item)
            for ok, value in results:
                if not ok:
                    log.err(value, msg, spider=info.spider)

        bookfile_paths_urls = [(x['path'], x['url']) for ok, x in results
                               if ok]
        bookfile_path_url = list_first_item(bookfile_paths_urls)
        if bookfile_path_url:
            item['book_file'] = os.path.join(
                os.path.abspath(self.bookfile_store), bookfile_path_url[0])
            item['book_file_url'] = bookfile_path_url[1]
            return item
        else:
            if self.item_download[item['original_url']]:
                next_url = list_first_item(
                    self.item_download[item['original_url']])
                self.item_download[item['original_url']] = self.item_download[
                    item['original_url']][1:]
                return Request(next_url)
            else:
                return item
Example #31
    def media_downloaded(self, response, request, info):
        referer = request.headers.get("Referer")
        if response.status != 200:
            log.msg(
                format="Image (code: %(status)s): Error downloading image from %(request)s referred in <%(referer)s>",
                level=log.WARNING,
                spider=info.spider,
                status=response.status,
                request=request,
                referer=referer,
            )
            raise ImageException("download-error")

        if not response.body:
            log.msg(
                format="Image (empty-content): Empty image from %(request)s referred in <%(referer)s>: no-content",
                level=log.WARNING,
                spider=info.spider,
                request=request,
                referer=referer,
            )
            raise ImageException("empty-content")

        status = "cached" if "cached" in response.flags else "downloaded"
        log.msg(
            format="Image (%(status)s): Downloaded image from %(request)s referred in <%(referer)s>",
            level=log.DEBUG,
            spider=info.spider,
            status=status,
            request=request,
            referer=referer,
        )
        self.inc_stats(info.spider, status)

        try:
            key = self.image_key(request.url)
            result_hash = self.image_downloaded(response, request, info)
            checksum = result_hash["checksum"]
            width = result_hash["width"]
            height = result_hash["height"]
            size = result_hash["size"]
            self.inc_image_size(info.spider, size)
        except ImageException as exc:
            whyfmt = "Image (error): Error processing image from %(request)s referred in <%(referer)s>: %(errormsg)s"
            log.msg(
                format=whyfmt,
                level=log.WARNING,
                spider=info.spider,
                request=request,
                referer=referer,
                errormsg=str(exc),
            )
            raise
        except Exception as exc:
            whyfmt = "Image (unknown-error): Error processing image from %(request)s referred in <%(referer)s>"
            log.err(None, whyfmt % {"request": request, "referer": referer}, spider=info.spider)
            raise ImageException(str(exc))

        return {"url": request.url, "path": key, "checksum": checksum, "width": width, "height": height, "size": size}
Example #32
    def full_run_required(self):
        if not os.path.exists(
                os.path.join(HERE, 'chainreactioncycles_products.csv')):
            log.err("Does not exist")
            return True

        # run full only on Mondays
        return datetime.now().weekday() == 0
Example #33
 def item_completed(self, results, item, info):
     """Called per item when all media requests has been processed"""
     if self.LOG_FAILED_RESULTS:
         msg = '%s found errors proessing %s' % (self.__class__.__name__, item)
         for ok, value in results:
             if not ok:
                 log.err(value, msg, spider=info.spider)
     return item
Example #34
 def parse_product_list(self, response):
     err("33333333333333333333333")
     head = response.meta
     hxs = HtmlXPathSelector(response)
     product_detail_url = hxs.select(
         "//dd[@class='thumb']/../dt[@class='photo']/a/@href").extract()
     for letter in product_detail_url:
         yield Request("http:" + letter, callback=self.product, meta=head)
Example #35
 def product_list(self, response):
     err("**************************")
     head = response.meta
     for letter in self.seller_name:
         list_url = "http://" + letter + ".tmall.hk/search.htm"
         head["list_url"] = list_url
         head["seller_shop_name"] = self.get_seller_shop_name(letter)
         yield Request(list_url, callback=self.get_product_page, meta=head)
Example #36
 def process_item(self, item, spider):
     try:
         line = item['name'] + "," + item['address'] + "\n"
         self.file.write(line)
     except BaseException as x:
         print 'Error:', x
         print item
         log.err(str(x))
     return item
Example #37
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
         return
     log.err(_failure, "Spider error processing %s" % request, spider=spider)
     send_catch_log(signal=signals.spider_error, failure=_failure, response=response, spider=spider)
     stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, spider=spider)
Example #38
0
 def item_completed(self, results, item, info):
     """Called per item when all media requests has been processed"""
     if self.LOG_FAILED_RESULTS:
         msg = '%s found errors proessing %s' % (self.__class__.__name__, item)
         for ok, value in results:
             if not ok:
                 log.err(value, msg, spider=info.spider)
     return item
Example #39
 def kaka(self):
     url = []
     string_a = "http://hzp.rayli.com.cn/brandlist/search/"
     mu = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
           'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
     for letter in mu:
         url.append(string_a + letter + "_3_1.html")
     self.set_url.extend(url)
     err("########################################")
     return url
Example #40
 def get_product_page(self, response):
     err("%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
     hea = response.meta
     hxs = HtmlXPathSelector(response)
     list_page_num = hxs.select("//b[@class='ui-page-s-len']/text()").extract()
     list_page_num = list_page_num[0].split("/")[1]
     for le in range(1, int(list_page_num) + 1):
         product_list_url = hea["list_url"] + "?pageNo=" + str(le)
         yield Request(product_list_url, callback=self.parse_product_list, meta=hea)
Example #41
def pyproducer(topic, _key, _value):
    log.msg('sending message!')
    producer = KafkaProducer(bootstrap_servers='115.231.103.59:9092,115.231.103.212:9092,115.231.103.60:9092',
                             retries=3, api_version='0.8.2')
    future = producer.send(topic, key=_key, value=_value)
    try:
        record_metadata = future.get(timeout=10)
    except Exception as e:
        # Decide what to do if the produce request failed...
        log.err(str(e))
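A note on the snippet above: kafka-python's `KafkaProducer` is meant to be long-lived and reused, so constructing one per call re-opens broker connections every time. A sketch of the reuse pattern (the module-level `_producer` name is an assumption, not from the original):

    _producer = KafkaProducer(bootstrap_servers='115.231.103.59:9092,115.231.103.212:9092,115.231.103.60:9092',
                              retries=3, api_version='0.8.2')

    def pyproducer(topic, _key, _value):
        log.msg('sending message!')
        future = _producer.send(topic, key=_key, value=_value)
        try:
            future.get(timeout=10)  # block until acked, or raise on failure
        except Exception as e:
            log.err(str(e))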
Example #42
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
         return
     log.err(_failure, "Spider error processing %s" % request, spider=spider)
     self.signals.send_catch_log(signal=signals.spider_error, failure=_failure, response=response, \
         spider=spider)
     self.crawler.stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
         spider=spider)
Example #43
    def media_downloaded(self, response, request, info):
        referer = request.headers.get("Referer")

        if response.status != 200:
            log.msg(
                format="File (code: %(status)s): Error downloading file from %(request)s referred in <%(referer)s>",
                level=log.WARNING,
                spider=info.spider,
                status=response.status,
                request=request,
                referer=referer,
            )
            raise FileException("download-error")

        if not response.body:
            log.msg(
                format="File (empty-content): Empty file from %(request)s referred in <%(referer)s>: no-content",
                level=log.WARNING,
                spider=info.spider,
                request=request,
                referer=referer,
            )
            raise FileException("empty-content")

        status = "cached" if "cached" in response.flags else "downloaded"
        log.msg(
            format="File (%(status)s): Downloaded file from %(request)s referred in <%(referer)s>",
            level=log.DEBUG,
            spider=info.spider,
            status=status,
            request=request,
            referer=referer,
        )
        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info)
            checksum = self.file_downloaded(response, request, info)
        except FileException as exc:
            whyfmt = "File (error): Error processing file from %(request)s referred in <%(referer)s>: %(errormsg)s"
            log.msg(
                format=whyfmt,
                level=log.WARNING,
                spider=info.spider,
                request=request,
                referer=referer,
                errormsg=str(exc),
            )
            raise
        except Exception as exc:
            whyfmt = "File (unknown-error): Error processing file from %(request)s referred in <%(referer)s>"
            log.err(None, whyfmt % {"request": request, "referer": referer}, spider=info.spider)
            raise FileException(str(exc))

        return {"url": request.url, "path": path, "checksum": checksum}
Example #44
 def handle_spider_error(self,
                         _failure,
                         request,
                         spider,
                         propagated_failure=None):
     referer = request.headers.get('Referer', None)
     msg = "Spider error processing <%s> (referer: <%s>)" % \
         (request.url, referer)
     log.err(_failure, msg, spider=spider)
     stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
         spider=spider)
Example #45
    def open_spider(self, spider):
        if hasattr(spider, 'mongo'):
            try:
                uri = spider.mongo
                log.msg('connect <{}>'.format(uri))
                self.cnn, self.db, self.tbl = utils.connect_uri(uri)
                return
            except Exception as ex:
                log.err('cannot connect to mongodb: {}'.format(ex))

        self.cnn = self.db = None
Example #46
    def item_completed(self, results, item, info):
        if self.LOG_FAILED_RESULTS:
            msg = '%s found errors processing %s' % (self.__class__.__name__, item)
            for ok, value in results:
                if not ok:
                    log.err(value, msg, spider=info.spider)

        image_paths = [x['path'] for ok, x in results if ok]
        image_path = list_first_item(image_paths)
        item['book_covor_image_path'] = os.path.join(os.path.abspath(self.images_store), image_path) if image_path else ""

        return item
Example #47
    def open_spider(self, spider):
        if hasattr(spider, 'zmq'):
            try:
                from utils.api import MessageSender
                uri = spider.zmq
                log.msg('connect <{}>'.format(uri))
                self.sender = MessageSender(uri)
                return
            except Exception as ex:
                log.err('cannot connect to zmq: {}'.format(ex))

        self.sender = None
Example #48
 def parse(self, response):
     search_result = re.search('"&token=([0-9a-z]{32})"', response.body_as_unicode())
     if search_result:
         token = search_result.groups()[0]
         token_item = Token()
         token_item['type'] = self.name
         token_item['token'] = token
         token_item['token_date'] = datetime.datetime.now().strftime('%Y%m%d')
         yield token_item
     else:
         log.err('token get error!')
Example #49
    def open_spider(self, spider):
        if hasattr(spider, 'zmq'):
            try:
                self.zmq = __import__('zmq')
                uri = spider.zmq
                log.msg('connect <{}>'.format(uri))
                self.ctx, self.skt, _ = utils.connect_uri(uri)
                return
            except Exception as ex:
                log.err('cannot connect to zmq: {}'.format(ex))

        self.ctx = self.skt = None
Example #50
 def parse_page2(self, response):
     # .extract() returns an empty list rather than raising, so test for
     # emptiness instead of wrapping the xpath calls in try/except.
     urls = response.xpath('//div[@class="blog_title"]/a/@href').extract()
     if not urls:
         urls = response.xpath('//span[@class="atc_title"]/strong/a/@href').extract()
     if not urls:
         log.err('Cannot extract any blog urls from %s' % response.url)
     for url in urls:
         # log.msg('scraped new post: ' + url)
         yield scrapy.Request(url, callback=self.parse_page3)
Example #51
    def open_spider(self, spider):
        if hasattr(spider, 'mysql'):
            try:
                uri = spider.mysql
                log.msg('connect <{}>'.format(uri))
                self.cnn, _, self.tbl = utils.connect_uri(uri)
                self.cur = self.cnn.cursor()
                return
            except Exception as ex:
                traceback.print_exc()
                log.err('cannot connect to mysql: {}'.format(ex))

        self.cnn = self.cur = None
Example #52
    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as exc:
                slot.start_requests = None
                log.err(None, 'Obtaining request from start requests', \
                        spider=spider)
            else:
                self.crawl(request, spider)
        elif not slot.start_requests:
            if self.spider_is_idle(spider):
                self.locker.acquire()
                try:
                    if not self._next_request_from_scheduler(spider):
                        log.msg(message='Request queue is empty, get requests from start requests', _level=log.INFO)
                        assert spider in self.open_spiders, \
                            "Spider %r not opened when crawling" % spider.name
                        start_requests = spider.start_requests()
                        while True:
                            try:
                                request = next(start_requests)
                            except StopIteration:
                                break
                            except Exception as exc:
                                log.err(None, 'Obtaining request from start requests', \
                                        spider=spider)
                                break
                            else:
                                self.schedule(request, spider)
                        slot.nextcall.schedule()
                finally:
                    self.locker.release()

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)
Example #53
 def fetch_callback(self, response):
     request = response.meta['twisted_request']
     result_response = dict(status=response.status,
                            headers=response.headers.to_string())
     if response.status != 200:
         finish_request(request, response=result_response)
         return
     if not isinstance(response, (HtmlResponse, XmlResponse)):
         msg = "Non-html response: %s" % response.headers.get(
             'content-type', 'no content type')
         finish_request(request, error=msg)
         return
     try:
         params = response.meta['slyd_request_params']
         original_html = extract_html(response)
         cleaned_html = html4annotation(original_html, response.url)
         # we may want to include some headers
         fingerprint = request_fingerprint(response.request)
         result_response = dict(status=response.status,
                                headers=response.headers.to_string())
         result = dict(page=cleaned_html,
                       original=original_html,
                       fp=fingerprint,
                       response=result_response)
         # HACKY: return the spider but also return the template specs.
         # We need them to map the template_id to the template name.
         spider, templates = self.create_spider(request.project,
                                                request.auth_info, params)
         if spider is not None:
             items = []
             links = []
             for value in spider.parse(response):
                 if isinstance(value, Request):
                     links.append(value.url)
                 elif isinstance(value, DictItem):
                     value['_template_name'] = self._get_template_name(
                         value['_template'], templates)
                     items.append(value._values)
                 else:
                     raise ValueError("Unexpected type %s from spider" %
                                      type(value))
             result['items'] = items
             result['links'] = links
         finish_request(request, **result)
     except Exception as ex:
         log.err(ex)
         finish_request(request,
                        response=result_response,
                        error="unexpected internal error: %s" % ex)
Example #54
 def _check_propagated_failure(self, spider_failure, propagated_failure,
                               request, spider):
     """Log and silence the bugs raised outside of spiders, but still allow
      spiders to be notified about general failures while downloading
      spider-generated requests.
      """
      # ignored requests are commonly propagated exceptions safe to silence
     if isinstance(spider_failure.value, IgnoreRequest):
         return
     elif spider_failure is propagated_failure:
         log.err(spider_failure, 'Unhandled error propagated to spider', \
             spider=spider)
         return  # stop propagating this error
     else:
         return spider_failure  # exceptions raised in the spider code
Example #55
    def parse_resume(self, response):
        resume = response.meta['resume']
        data = json.loads(response.body)['resumeHtml']
        try:
            log.msg(
                'http://www.zhaopingou.com/resume/detail?source=1&resumeId=%s'
                % (data['id']))

            resume['url'] = ('http://www.zhaopingou.com/resume/detail'
                             '?source=1&resumeId=%s' % data['id'])

            resume['photo'] = data['resumeImg']
            resume['birthday'] = data['birthday']
            resume['hometown'] = data['residence']
            resume['age'] = data['age']
            resume['live_city'] = data['address']
            resume['degree'] = data['degreesName']

            if 'evaluate' in data:
                resume['self_intro'] = data['evaluate']

            resume['exp_mode'] = ''
            resume['exp_city'] = data['hopeAddress']
            log.msg(resume['exp_city'])
            resume['exp_pos'] = data['hopePosition']
            resume['exp_industry'] = data['hopeIndustry']
            resume['exp_salary'] = data['hopeSalary']
            tmplist = []
            tmp_dict = {}
            tmp_dict['company'] = data['last_company']
            tmp_dict['position'] = data['last_company_pname']
            tmp_dict['time_range'] = data['last_company_time']
            tmplist.append(tmp_dict)
            resume['work_exp'] = json.dumps(tmplist, ensure_ascii=False)
            resume['work_exp_ds'] = ''
            resume['edu_exp'] = ''
            resume['lang_skill'] = ''
            resume['cert'] = ''
            resume['ability'] = data['skills']
            resume['showme'] = ''
            resume['refresh_time'] = data['crate_time']

            #resume['html_filepath'] = 'http://www.zhaopingou.com'+data['html_filepath']
            #resume['doc_filepath'] = 'http://www.zhaopingou.com'+data['doc_filepath']
            yield resume
        except Exception as e:
            log.err(e)