Example #1
 def __init__(self):
     if not settings.getbool('REDIRECT_ENABLED'):
         raise NotConfigured
     self.max_metarefresh_delay = settings.getint(
         'REDIRECT_MAX_METAREFRESH_DELAY')
     self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
     self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
Example #2
 def from_settings(cls, settings):
     cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
     cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
     cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
     cls.THUMBS = settings.get('IMAGES_THUMBS', {})
     cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD)
     cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD)
     store_uri = settings['IMAGES_STORE']
     return cls(store_uri)
Example #3
 def from_settings(cls, settings):
     cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
     cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
     cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
     cls.THUMBS = {}
     s3store = cls.STORE_SCHEMES['s3']
     s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
     s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
     store_uri = settings['IMAGES_STORE']
     return cls(store_uri)
Example #4
 def from_settings(cls, settings):
     cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
     cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
     cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
     cls.THUMBS = {}
     s3store = cls.STORE_SCHEMES['s3']
     s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
     s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
     store_uri = settings['IMAGES_STORE']
     return cls(store_uri)
Example #5
 def _print_setting(self, opts):
     if opts.get:
         print settings_.get(opts.get)
     elif opts.getbool:
         print settings_.getbool(opts.getbool)
     elif opts.getint:
         print settings_.getint(opts.getint)
     elif opts.getfloat:
         print settings_.getfloat(opts.getfloat)
     elif opts.getlist:
         print settings_.getlist(opts.getlist)
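
The command above simply dispatches to the typed getters; a minimal standalone sketch of what each conversion does, using scrapy.settings.Settings with made-up setting names and values:

from scrapy.settings import Settings

s = Settings({'LIMIT': '42', 'ENABLED': 'True', 'RATIO': '0.5', 'CODES': '500,502,503'})
print(s.get('LIMIT'))        # '42' -- the raw value is still a string
print(s.getint('LIMIT'))     # 42
print(s.getbool('ENABLED'))  # True
print(s.getfloat('RATIO'))   # 0.5
print(s.getlist('CODES'))    # ['500', '502', '503']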
Example #6
 def from_settings(cls, settings):
     cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
     cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
     cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
     cls.THUMBS = settings.get('IMAGES_THUMBS', {})
     cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD',
                                          cls.DEFAULT_IMAGES_URLS_FIELD)
     cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD',
                                            cls.DEFAULT_IMAGES_RESULT_FIELD)
     store_uri = settings['IMAGES_STORE']
     return cls(store_uri)
Example #7
    def __init__(self):
        self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
        self.itempassed = settings.getint('CLOSESPIDER_ITEMPASSED')

        self.counts = defaultdict(int)
        self.tasks = {}

        if self.timeout:
            dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        if self.itempassed:
            dispatcher.connect(self.item_passed, signal=signals.item_passed)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #8
    def __init__(self):
        if not settings.getbool("MEMUSAGE_ENABLED"):
            raise NotConfigured
        if not procfs_supported():
            raise NotConfigured

        self.warned = False
        self.notify_mails = settings.getlist("MEMUSAGE_NOTIFY")
        self.limit = settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024
        self.warning = settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024
        self.report = settings.getbool("MEMUSAGE_REPORT")
        self.mail = MailSender()
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
Example #9
    def __init__(self):
        if not settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        if not os.path.exists('/proc'):
            raise NotConfigured

        self.warned = False
        self.notify_mails = settings.getlist('MEMUSAGE_NOTIFY')
        self.limit = settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
        self.warning = settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
        self.report = settings.getbool('MEMUSAGE_REPORT')
        self.mail = MailSender()
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
Example #10
    def __init__(self):
        if not settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        if not procfs_supported():
            raise NotConfigured

        self.warned = False
        self.notify_mails = settings.getlist('MEMUSAGE_NOTIFY')
        self.limit = settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
        self.warning = settings.getint('MEMUSAGE_WARNING_MB') * 1024 * 1024
        self.report = settings.getbool('MEMUSAGE_REPORT')
        self.mail = MailSender()
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
Example #11
 def __init__(self, settings):
     # Timestamp of the last direct (proxy-less) connection
     self.last_no_proxy_time = datetime.now()
     # After this many minutes, switch back to a direct connection, since proxies slow crawling down
     self.recover_interval = 20
     # If a proxy keeps timing out before being used this many times, remove it permanently.
     # Set to 0 to never modify the proxy file.
     self.dump_count_threshold = 20
     # Whether to disable a proxy when requests through it time out
     self.invalid_proxy_flag = True
     # When the number of valid proxies (including the direct connection) drops below this value,
     # fetch new proxies from the web. Pick it large enough that every IP gets a rest after being
     # asked for a captcha: with ten usable proxies in rotation, each IP only comes up again after
     # several minutes and can serve some requests without a captcha. If the value is too small
     # (say two), IP A gets banned after a few requests, then IP B right after, and the whole
     # crawler ends up busy-waiting, which hurts throughput.
     self.extend_proxy_threshold = 5
     # Initialize the proxy list
     self.proxyes = [{"proxy": None, "valid": True, "count": 0}]
     # Start with proxy 0, i.e. no proxy (direct connection)
     self.proxy_index = 0
     # Number of trusted proxies (e.g. self-hosted HTTP proxies) + 1 for the direct connection
     self.fixed_proxy = len(self.proxyes)
     # Time of the last proxy fetch
     self.last_fetch_proxy_time = datetime.now()
     # Force-fetch new proxies at this fixed interval (minutes)
     self.fetch_proxy_interval = 120
     # A proxy about to be marked invalid is spared if it has already fetched more pages than this
     self.invalid_proxy_threshold = 200
     # At crawler start-up, fetch proxy IPs in a separate thread
     self.threadLock = threading.Lock()
     self.proxysStatus = 0  # 0: proxies not fetched, 1: fetching proxies, 2: proxy IPs fetched
     self.max_retry_times = settings.getint('RETRY_TIMES')
     # Remove a proxy after fail_count_threadhold failed fetches
     self.fail_count_threadhold = 3
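
Only RETRY_TIMES above comes from the settings object; every other threshold is hard-coded. A minimal sketch of reading them through settings.getint() with the current values as defaults -- the PROXY_* setting names are invented for illustration, not part of the original middleware:

class ProxyTuning(object):
    # Illustrative only: hypothetical PROXY_* settings, defaults mirror the hard-coded values above.
    def __init__(self, settings):
        self.recover_interval = settings.getint('PROXY_RECOVER_INTERVAL', 20)
        self.dump_count_threshold = settings.getint('PROXY_DUMP_COUNT_THRESHOLD', 20)
        self.extend_proxy_threshold = settings.getint('PROXY_EXTEND_THRESHOLD', 5)
        self.fetch_proxy_interval = settings.getint('PROXY_FETCH_INTERVAL', 120)
        self.invalid_proxy_threshold = settings.getint('PROXY_INVALID_THRESHOLD', 200)
        self.max_retry_times = settings.getint('RETRY_TIMES')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)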
Example #12
 def __init__(self):
     if not settings.getbool('TELNETCONSOLE_ENABLED'):
         raise NotConfigured
     self.protocol = makeProtocol
     self.noisy = False
     port = settings.getint('TELNETCONSOLE_PORT')
     reactor.callWhenRunning(reactor.listenTCP, port, self)
Example #13
    def __init__(self):
        self.delay = settings.getint('SPIDER_CLOSE_DELAY')
        if not self.delay:
            raise NotConfigured

        self.opened_at = defaultdict(time)
        dispatcher.connect(self.spider_idle, signal=signals.spider_idle)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #14
 def __init__(self):
     if not settings.getbool('WEBCONSOLE_ENABLED'):
         raise NotConfigured
     logfile = settings['WEBCONSOLE_LOGFILE']
     server.Site.__init__(self, WebConsoleResource(), logPath=logfile)
     self.noisy = False
     port = settings.getint('WEBCONSOLE_PORT')
     reactor.callWhenRunning(reactor.listenTCP, port, self)
Example #15
    def __init__(self):
        self.delay = settings.getint('SPIDER_CLOSE_DELAY')
        if not self.delay:
            raise NotConfigured

        self.opened_at = defaultdict(time)
        dispatcher.connect(self.spider_idle, signal=signals.spider_idle)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #16
    def __init__(self, smtphost=None, mailfrom=None, smtpuser=None, smtppass=None, \
            smtpport=None):
        self.smtphost = smtphost or settings['MAIL_HOST']
        self.smtpport = smtpport or settings.getint('MAIL_PORT')
        self.smtpuser = smtpuser or settings['MAIL_USER']
        self.smtppass = smtppass or settings['MAIL_PASS']
        self.mailfrom = mailfrom or settings['MAIL_FROM']

        if not self.smtphost or not self.mailfrom:
            raise NotConfigured("MAIL_HOST and MAIL_FROM settings are required")
Example #17
    def __init__(self):
        self.max_queue_size = settings.getint("REQUESTS_QUEUE_SIZE")
        if not self.max_queue_size:
            raise NotConfigured

        self.max_pending = {}
        self.dropped_count = {}

        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #18
    def __init__(self):
        self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
        self.itempassed = settings.getint('CLOSESPIDER_ITEMPASSED')
        self.pagecount = settings.getint('CLOSESPIDER_PAGECOUNT')
        self.errorcount = settings.getint('CLOSESPIDER_ERRORCOUNT')

        self.errorcounts = defaultdict(int)
        self.pagecounts = defaultdict(int)
        self.counts = defaultdict(int)
        self.tasks = {}

        if self.errorcount:
            txlog.addObserver(self.catch_log)
        if self.pagecount:
            dispatcher.connect(self.page_count, signal=signals.response_received)
        if self.timeout:
            dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        if self.itempassed:
            dispatcher.connect(self.item_passed, signal=signals.item_passed)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #19
    def __init__(self):
        try:
            from redis import Redis
        except ImportError:
            raise NotConfigured

        # get settings
        queue = settings.get('REDIS_QUEUE')
        if queue is None:
            raise NotConfigured

        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.getint('REDIS_PORT', 6379)
        db = settings.getint('REDIS_DB', 0)
        password = settings.get('REDIS_PASSWORD')

        self.redis = Redis(host=host, port=port, db=db, password=password)
        self.queue = queue

        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
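Example #20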
 def wrapper(*args, **kwargs):
     max_attempts = settings.getint("MAX_MONGO_RECONNECT_ATTEMPTS", MAX_AUTO_RECONNECT_ATTEMPTS)
     mail = MailSender()
     for attempt in xrange(max_attempts):
         try:
             return mongo_op_func(*args, **kwargs)
         except AutoReconnect as e:
             wait_t = 1 + attempt  # linear back-off between attempts
             log.msg("PyMongo auto-reconnecting... %s. Waiting %.1f seconds."%(str(e), wait_t), log.INFO)
             mail.send(to=[settings.get('MAIL_TO')], subject='PyMongo auto-reconnecting....', \
                   body="%s\n%s"%(e, traceback.format_exc()))
             time.sleep(wait_t)
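
The retry wait above grows linearly with the attempt number. For comparison, a minimal sketch of a genuinely exponential back-off, with a hypothetical op callable standing in for the wrapped MongoDB operation:

import time

def retry_with_exponential_backoff(op, max_attempts=5, base_delay=1.0):
    # Waits 1s, 2s, 4s, ... between attempts; re-raises after the last one.
    for attempt in range(max_attempts):
        try:
            return op()
        except Exception:
            if attempt == max_attempts - 1:
                raise
            time.sleep(base_delay * (2 ** attempt))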
Example #21
    def __init__(self, smtphost=None, mailfrom=None, smtpuser=None, smtppass=None, \
            smtpport=None, debug=False):
        self.smtphost = smtphost or settings['MAIL_HOST']
        self.smtpport = smtpport or settings.getint('MAIL_PORT')
        self.smtpuser = smtpuser or settings['MAIL_USER']
        self.smtppass = smtppass or settings['MAIL_PASS']
        self.mailfrom = mailfrom or settings['MAIL_FROM']
        self.debug = debug

        if not self.smtphost or not self.mailfrom:
            raise NotConfigured(
                "MAIL_HOST and MAIL_FROM settings are required")
Example #22
    def __init__(self, crawler):
        self.crawler = crawler
        self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
        self.itemcount = settings.getint('CLOSESPIDER_ITEMCOUNT')
        # XXX: legacy support - remove for future releases
        if settings.getint('CLOSESPIDER_ITEMPASSED'):
            warnings.warn(
                "CLOSESPIDER_ITEMPASSED setting is deprecated, use CLOSESPIDER_ITEMCOUNT instead",
                ScrapyDeprecationWarning)
            self.itemcount = settings.getint('CLOSESPIDER_ITEMPASSED')
        self.pagecount = settings.getint('CLOSESPIDER_PAGECOUNT')
        self.errorcount = settings.getint('CLOSESPIDER_ERRORCOUNT')

        self.errorcounts = defaultdict(int)
        self.pagecounts = defaultdict(int)
        self.counts = defaultdict(int)
        self.tasks = {}

        if self.errorcount:
            txlog.addObserver(self.catch_log)
        if self.pagecount:
            dispatcher.connect(self.page_count,
                               signal=signals.response_received)
        if self.timeout:
            dispatcher.connect(self.spider_opened,
                               signal=signals.spider_opened)
        if self.itemcount:
            dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #23
    def __init__(self, crawler):
        self.crawler = crawler
        self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
        self.itemcount = settings.getint('CLOSESPIDER_ITEMCOUNT')
        # XXX: legacy support - remove for future releases
        if settings.getint('CLOSESPIDER_ITEMPASSED'):
            warnings.warn("CLOSESPIDER_ITEMPASSED setting is deprecated, use CLOSESPIDER_ITEMCOUNT instead", ScrapyDeprecationWarning)
            self.itemcount = settings.getint('CLOSESPIDER_ITEMPASSED')
        self.pagecount = settings.getint('CLOSESPIDER_PAGECOUNT')
        self.errorcount = settings.getint('CLOSESPIDER_ERRORCOUNT')

        self.errorcounts = defaultdict(int)
        self.pagecounts = defaultdict(int)
        self.counts = defaultdict(int)
        self.tasks = {}

        if self.errorcount:
            txlog.addObserver(self.catch_log)
        if self.pagecount:
            dispatcher.connect(self.page_count, signal=signals.response_received)
        if self.timeout:
            dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        if self.itemcount:
            dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #24
    def __init__(self):
        self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
        self.itempassed = settings.getint('CLOSESPIDER_ITEMPASSED')
        self.pagecount = settings.getint('CLOSESPIDER_PAGECOUNT')
        self.errorcount = settings.getint('CLOSESPIDER_ERRORCOUNT')

        self.errorcounts = defaultdict(int)
        self.pagecounts = defaultdict(int)
        self.counts = defaultdict(int)
        self.tasks = {}

        if self.errorcount:
            txlog.addObserver(self.catch_log)
        if self.pagecount:
            dispatcher.connect(self.page_count,
                               signal=signals.response_received)
        if self.timeout:
            dispatcher.connect(self.spider_opened,
                               signal=signals.spider_opened)
        if self.itempassed:
            dispatcher.connect(self.item_passed, signal=signals.item_passed)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
Example #25
 def __init__(self):
     if not settings.getbool('WEBSERVICE_ENABLED'):
         raise NotConfigured
     logfile = settings['WEBSERVICE_LOGFILE']
     port = settings.getint('WEBSERVICE_PORT')
     root = RootResource()
     reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'], \
         settings['WEBSERVICE_RESOURCES'])
     for res_cls in map(load_object, reslist):
         res = res_cls()
         root.putChild(res.ws_name, res)
     server.Site.__init__(self, root, logPath=logfile)
     self.noisy = False
     reactor.callWhenRunning(reactor.listenTCP, port, self)
Example #26
 def __init__(self, settings):
     self.settings = settings
     self.urifmt = settings['FEED_URI']
     if not self.urifmt:
         raise NotConfigured
     self.format = settings['FEED_FORMAT'].lower()
     self.export_encoding = settings['FEED_EXPORT_ENCODING']
     self.storages = self._load_components('FEED_STORAGES')
     self.exporters = self._load_components('FEED_EXPORTERS')
     if not self._storage_supported(self.urifmt):
         raise NotConfigured
     if not self._exporter_supported(self.format):
         raise NotConfigured
     self.store_empty = settings.getbool('FEED_STORE_EMPTY')
     self._exporting = False
     self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
     self.indent = None
     if settings.get('FEED_EXPORT_INDENT') is not None:
         self.indent = settings.getint('FEED_EXPORT_INDENT')
     uripar = settings['FEED_URI_PARAMS']
     self._uripar = load_object(uripar) if uripar else lambda x, y: None
Example #27
 def __init__(self, settings):
     self.settings = settings
     self.urifmt = settings['FEED_URI']
     if not self.urifmt:
         raise NotConfigured
     self.format = settings['FEED_FORMAT'].lower()
     self.export_encoding = settings['FEED_EXPORT_ENCODING']
     self.storages = self._load_components('FEED_STORAGES')
     self.exporters = self._load_components('FEED_EXPORTERS')
     if not self._storage_supported(self.urifmt):
         raise NotConfigured
     if not self._exporter_supported(self.format):
         raise NotConfigured
     self.store_empty = settings.getbool('FEED_STORE_EMPTY')
     self._exporting = False
     self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
     self.indent = None
     if settings.get('FEED_EXPORT_INDENT') is not None:
         self.indent = settings.getint('FEED_EXPORT_INDENT')
     uripar = settings['FEED_URI_PARAMS']
     self._uripar = load_object(uripar) if uripar else lambda x, y: None
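
The extra settings.get() check before getint() matters because getint() falls back to 0 when a setting is absent, which would be indistinguishable from an explicit indent of 0. A short standalone illustration (MISSING_SETTING is a name chosen just for the demo):

from scrapy.settings import Settings

s = Settings({'FEED_EXPORT_INDENT': 0})
print(s.getint('FEED_EXPORT_INDENT'))    # 0 -- an explicit user choice
print(s.getint('MISSING_SETTING'))       # 0 -- merely the getint() default
print(s.get('MISSING_SETTING') is None)  # True -- only get() distinguishes the two cases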
Example #28
    def __init__(self, download_delay=None, max_concurrent_requests=None):
        if download_delay is None:
            self._download_delay = settings.getfloat("DOWNLOAD_DELAY")
        else:
            self._download_delay = float(download_delay)
        if self._download_delay:
            self.max_concurrent_requests = 1
        elif max_concurrent_requests is None:
            self.max_concurrent_requests = settings.getint("CONCURRENT_REQUESTS_PER_SPIDER")
        else:
            self.max_concurrent_requests = max_concurrent_requests
        if self._download_delay and settings.getbool("RANDOMIZE_DOWNLOAD_DELAY"):
            # same policy as wget --random-wait
            self.random_delay_interval = (0.5 * self._download_delay, 1.5 * self._download_delay)
        else:
            self.random_delay_interval = None

        self.active = set()
        self.queue = []
        self.transferring = set()
        self.closing = False
        self.lastseen = 0
        self.next_request_calls = set()
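
The (0.5 * delay, 1.5 * delay) interval mirrors wget's --random-wait policy; a minimal sketch of how a per-request delay would be sampled from it (illustrative helper, not the snippet's actual scheduling code):

import random

def next_download_delay(download_delay, randomize=True):
    # Sample between 0.5x and 1.5x the configured delay so request timing is less uniform.
    if randomize and download_delay:
        return random.uniform(0.5 * download_delay, 1.5 * download_delay)
    return download_delay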
Example #29
 def __init__(self):
     self.maxdepth = settings.getint('DEPTH_LIMIT')
     self.stats = settings.getbool('DEPTH_STATS')
     if self.stats and self.maxdepth:
         stats.set_value('envinfo/request_depth_limit', self.maxdepth)
Example #30
 def __init__(self):
     if not settings.getbool('REDIRECT_ENABLED'):
         raise NotConfigured
     self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
     self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
     self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
Example #31
 def __init__(self):
     self.maxlength = settings.getint('URLLENGTH_LIMIT')
     if not self.maxlength:
         raise NotConfigured
Example #32
 def __init__(self, engine):
     self.sites = {}
     self.spidermw = SpiderMiddlewareManager()
     self.itemproc = load_object(settings['ITEM_PROCESSOR'])()
     self.concurrent_items = settings.getint('CONCURRENT_ITEMS')
     self.engine = engine
Example #33
 def __init__(self):
     self.max_retry_times = settings.getint('RETRY_TIMES')
     self.retry_http_codes = map(int, settings.getlist('RETRY_HTTP_CODES'))
     self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
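Example #34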
  def parse_weibo(self, response):
    query = response.request.meta['query']
    start = datetime.strptime(response.request.meta['start'], "%Y-%m-%d %H:%M:%S")
    end = datetime.strptime(response.request.meta['end'], "%Y-%m-%d %H:%M:%S")
    range = daterange(start, end).delta()
    last_fetched = datetime.strptime(response.request.meta['last_fetched'], "%Y-%m-%d %H:%M:%S")

    jQuery = pq(response.body)
    scripts = jQuery('script')

    text = "".join(filter(lambda x: x is not None, [x.text for x in scripts]))
    # check if we exceed the sina limit
    sassfilter_match = re.search(r'{(\"pid\":\"pl_common_sassfilter\".*?)}', text, re.M | re.I)
    if sassfilter_match:
      raise CloseSpider('weibo search exceeded')

    # check the num of search results
    totalshow_match = re.search(r'{(\"pid\":\"pl_common_totalshow\".*?)}', text, re.M | re.I)
    if totalshow_match:
      html = json.loads(totalshow_match.group())['html']
      if len(html) == 0:
        raise CloseSpider('not login? %s' % html)
      totalshow = pq(html)
      if totalshow('div.topcon_l').html() is None:
        log.msg('%s 0 feeds' % query, level=log.INFO)
        return
      topcon_num = int(re.search('\s(\d+)\s', totalshow('div.topcon_l').text().replace(',', ''), re.I).group(1))
      log.msg('%s %d feeds' % (query, topcon_num), level=log.INFO)
      max_feeds = settings.getint('FEED_LIMIT', 200000)
      if topcon_num > max_feeds:
        log.msg('too much (%d) result for %s.' % (topcon_num, query), logLevel=log.WARNING)
      elif 1000 < topcon_num < max_feeds:
        # weibo search only allow 20 feeds on 1 page and at most 50 pages.
        days = range.days / float(2)
        middle = start + timedelta(days)

        # first part
        url = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), start, middle)
        request = Request(url=url, callback=self.parse_weibo)
        request.meta['query'] = query
        request.meta['start'] = start.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['end'] = middle.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['priority'] = days / 2
        request.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
        yield request

        # second part
        url2 = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), middle, end)
        request2 = Request(url=url2, callback=self.parse_weibo)
        request2.meta['query'] = query
        request2.meta['start'] = middle.strftime("%Y-%m-%d %H:%M:%S")
        request2.meta['end'] = end.strftime("%Y-%m-%d %H:%M:%S")
        request2.meta['priority'] = days / 2
        request2.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
        yield request2
      else:
        # check the feeds update
        feedlist_match = re.search(r'{(\"pid\":\"pl_weibo_feedlist\".*?)}', text, re.M | re.I)
        if feedlist_match:
          search_results = pq(json.loads(feedlist_match.group())['html'])
          feeds = search_results('dl.feed_list')
          search_pages = search_results('ul.search_page_M')
          pages = SearchPage.wrap(search_pages)

          # send the items to pipeline
          for feed in feeds:
            item = ScrapyWeiboItem()
            item['html'] = tostring(feed)
            yield item
            # skip first page and request other pages
          for i in xrange(2, len(pages)):
            query = pages[i]
            log.msg('%s' % query)
            request = Request(url=query, callback=self.parse_page)
            request.meta['query'] = query
            yield request
Example #35
 def __init__(self):
     self.sites = {}
     self.handlers = DownloadHandlers()
     self.middleware = DownloaderMiddlewareManager.from_settings(settings)
     self.concurrent_spiders = settings.getint('CONCURRENT_SPIDERS')
Example #36
 def __init__(self):
     self.sites = {}
     self.middleware = DownloaderMiddlewareManager()
     self.concurrent_spiders = settings.getint("CONCURRENT_SPIDERS")
Example #37
from scrapy.selector import Selector
from scrapy.http import Request
#from scrapy.http import FormRequest
from scrapy.conf import settings
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
#from scrapy.shell import inspect_response

from scrapy.utils.response import get_base_url

from tase.HistorySpider import HistorySpider
from tase.items import TaseItem

import tase.common

PROCESS_HISTORY = settings.getbool('PROCESS_HISTORY', False)
HISTORY_PERIOD = settings.getint('HISTORY_PERIOD', 2)
category_fund = settings.get('CATEGORY_FUND')


class FundSpider(HistorySpider):
    name = 'funds'
    allowed_domains = ['tase.co.il']
    start_urls = ['http://www.tase.co.il/TASEEng/MarketData/MutualFunds/']

    rules = (
        Rule(SgmlLinkExtractor(allow=[r'BuildCmb_6_1.js']),
             callback='parse_fund_list'),
        Rule(SgmlLinkExtractor(allow=('FundMainData\.htm', )),
             callback='parse_fund'),
        Rule(SgmlLinkExtractor(allow=(r'MutualFunds', )),
             callback='parse_fund_search'),
Example #38
from __future__ import with_statement

import cPickle as pickle

from scrapy.xlib.pydispatch import dispatcher

from scrapy.core.engine import scrapyengine
from scrapy.core.exceptions import NotConfigured
from scrapy.core import signals
from scrapy.utils.response import response_httprepr
from scrapy.stats import stats
from scrapy.http import Request
from scrapy import log
from scrapy.conf import settings

items_per_spider = settings.getint("ITEMSAMPLER_COUNT", 1)
close_spider = settings.getbool("ITEMSAMPLER_CLOSE_SPIDER", False)
max_response_size = settings.getint("ITEMSAMPLER_MAX_RESPONSE_SIZE")


class ItemSamplerPipeline(object):
    def __init__(self):
        self.filename = settings["ITEMSAMPLER_FILE"]
        if not self.filename:
            raise NotConfigured
        self.items = {}
        self.spiders_count = 0
        self.empty_domains = set()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
Example #39
  def parse_weibo(self, response):
    query = response.request.meta['query']
    start = datetime.strptime(response.request.meta['start'], "%Y-%m-%d %H:%M:%S")
    end = datetime.strptime(response.request.meta['end'], "%Y-%m-%d %H:%M:%S")
    range = daterange(start, end).delta()
    last_fetched = datetime.strptime(response.request.meta['last_fetched'], "%Y-%m-%d %H:%M:%S")

    jQuery = pq(response.body)
    scripts = jQuery('script')

    text = "".join(filter(lambda x: x is not None, [x.text for x in scripts]))
    # check if we exceed the sina limit
    sassfilter_match = re.search(r'{(\"pid\":\"pl_common_sassfilter\".*?)}', text, re.M | re.I)
    if sassfilter_match:
      raise CloseSpider('weibo search exceeded')

    # check the num of search results
    totalshow_match = re.search(r'{(\"pid\":\"pl_common_totalshow\".*?)}', text, re.M | re.I)
    if totalshow_match:
      html = json.loads(totalshow_match.group())['html']
      if len(html) == 0:
        raise CloseSpider('not login? %s' % html)
      totalshow = pq(html)
      if totalshow('div.topcon_l').html() is None:
        log.msg('%s 0 feeds' % query, level=log.INFO)
        return
      topcon_num = int(re.search('\s(\d+)\s', totalshow('div.topcon_l').text().replace(',', ''), re.I).group(1))
      log.msg('%s %d feeds' % (query, topcon_num), level=log.INFO)
      max_feeds = settings.getint('FEED_LIMIT', 200000)
      if topcon_num > max_feeds:
        log.msg('too much (%d) result for %s.' % (topcon_num, query), logLevel=log.WARNING)
      elif 1000 < topcon_num < max_feeds:
        # weibo search only allow 20 feeds on 1 page and at most 50 pages.
        days = range.days / float(2)
        middle = start + timedelta(days)

        # first part
        url = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), start, middle)
        request = Request(url=url, callback=self.parse_weibo)
        request.meta['query'] = query
        request.meta['start'] = start.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['end'] = middle.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['priority'] = days / 2
        request.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
        yield request

        # second part
        url2 = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), middle, end)
        request2 = Request(url=url2, callback=self.parse_weibo)
        request2.meta['query'] = query
        request2.meta['start'] = middle.strftime("%Y-%m-%d %H:%M:%S")
        request2.meta['end'] = end.strftime("%Y-%m-%d %H:%M:%S")
        request2.meta['priority'] = days / 2
        request2.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
        yield request2
      else:
        # check the feeds update
        feedlist_match = re.search(r'{(\"pid\":\"pl_weibo_feedlist\".*?)}', text, re.M | re.I)
        if feedlist_match:
          search_results = pq(json.loads(feedlist_match.group())['html'])
          feeds = search_results('dl.feed_list')
          search_pages = search_results('ul.search_page_M')
          pages = SearchPage.wrap(search_pages)

          # send the items to pipeline
          for feed in feeds:
            item = ScrapyWeiboItem()
            item['html'] = tostring(feed)
            yield item
            # skip first page and request other pages
          for i in xrange(2, len(pages)):
            query = pages[i]
            log.msg('%s' % query)
            request = Request(url=query, callback=self.parse_page)
            request.meta['query'] = query
            yield request
Example #40
 def __init__(self):
     if not settings.getbool('RETRY_ENABLED'):
         raise NotConfigured
     self.max_retry_times = settings.getint('RETRY_TIMES')
     self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
     self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
Example #41
 def __init__(self):
     if not settings.getbool('RETRY_ENABLED'):
         raise NotConfigured
     self.max_retry_times = settings.getint('RETRY_TIMES')
     self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
     self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
Example #42
 def __init__(self):
     self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
     self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
     self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
Example #43
 def __init__(self):
     self.max_metarefresh_delay = settings.getint(
         'REDIRECT_MAX_METAREFRESH_DELAY')
     self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
     self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
Example #44
from __future__ import with_statement

import cPickle as pickle

from scrapy.xlib.pydispatch import dispatcher

from scrapy.core.manager import scrapymanager
from scrapy.core.exceptions import NotConfigured
from scrapy.core import signals
from scrapy.utils.response import response_httprepr
from scrapy.stats import stats
from scrapy.http import Request
from scrapy import log
from scrapy.conf import settings

items_per_spider = settings.getint('ITEMSAMPLER_COUNT', 1)
close_spider = settings.getbool('ITEMSAMPLER_CLOSE_SPIDER', False)
max_response_size = settings.getint('ITEMSAMPLER_MAX_RESPONSE_SIZE')

class ItemSamplerPipeline(object):

    def __init__(self):
        self.filename = settings['ITEMSAMPLER_FILE']
        if not self.filename:
            raise NotConfigured
        self.items = {}
        self.spiders_count = 0
        self.empty_spiders = set()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
Example #45
 def __init__(self):
     self.sites = {}
     self.handlers = DownloadHandlers()
     self.middleware = DownloaderMiddlewareManager.from_settings(settings)
     self.concurrent_spiders = settings.getint('CONCURRENT_SPIDERS')
Example #46
 def __init__(self):
     self.max_retry_times = settings.getint('RETRY_TIMES')
     self.retry_http_codes = map(int, settings.getlist('RETRY_HTTP_CODES'))
     self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')