Example #1
class Recrawl(object):
  def __init__(self, servers, key, shard_dist, crawler):
    """Initialize Recrawler

    Parameters
    ----------
    server : Redis instance
    key : str
        Where to store fingerprints
    """
    self.queues = QueueManager()
    self.crawler = crawler
    data_queue = StrDataQueueCluster(servers, key, DataSet, shard_dist)
    self.queues.add(RECRAWL_LIST_TAG, data_queue)

  @classmethod
  def from_crawler(cls, crawler):
    servers = connection.from_settings(crawler.settings)
    key = crawler.settings.get('RECRAWL_LIST_KEY', RECRAWL_KEY)
    shard_dist = crawler.settings.get('RECRAWL_SHARD_DIST', RECRAWL_SHARD_DIST)
    recrawl = cls(servers, key, shard_dist, crawler)
    crawler.signals.connect(
        recrawl.setup_recrawl, signal=scrapy.signals.spider_opened)
    return recrawl

  def setup_recrawl(self, spider):
    self.crawler.signals.connect(
        self.recrawl, signal=le_crawler.signals.hourly_timeout)

  def recrawl(self):
    # Re-enqueue every stored URL into the priority queue, bypassing the dupefilter.
    recrawl_list = list(self.queues.list_members(RECRAWL_LIST_TAG))
    for url in recrawl_list:
      req = Request(url, dont_filter=True)
      self.queues.push(PRIORITY_QUEUE_TAG, req)
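
Usage sketch (an assumption, not taken from the original project): because Recrawl exposes a from_crawler() classmethod and connects crawler signals, it would normally be enabled as a Scrapy extension. The module path and values below are placeholders.

# settings.py -- placeholder module path and values
EXTENSIONS = {
    'myproject.extensions.Recrawl': 500,   # assumed location of the Recrawl class
}
RECRAWL_LIST_KEY = 'crawler:recrawl'       # Redis key holding the URL set to re-crawl
# RECRAWL_SHARD_DIST can be overridden as well; its format depends on StrDataQueueCluster.
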
def add(request, channel_id):
    """Add a YouTube video (from the 'element' GET parameter) to the channel's queue."""
    clock = Clock(logger=logger)
    clock.start()
    queue = QueueManager(channel=channel_id)

    element = request.GET['element']
    creator = __get_client_ip(request)
    # Strip non-printable characters from the submitted value
    url = ''.join(c for c in element if c in string.printable)

    # Extract the YouTube video id from a ".../watch?v=<id>" URL
    match = re.search(r'watch\?v=([^/,&]*)', url)
    if match:
        queue.add(url=match.group(1), creator=creator)
        logger.info('Added ' + url)
        logger.info("add returned in %f seconds" % clock.stop())
        return HttpResponse(1)
    else:
        logger.critical('Error! Invalid URL: ' + url)
        logger.info("add returned in %f seconds" % clock.stop())
        return HttpResponse(0)
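
Usage sketch (assumed wiring): the add view reads the 'element' GET parameter, extracts a YouTube video id from a watch?v= URL, and answers with "1" or "0" in the response body. A hypothetical urls.py entry and example request:

# urls.py -- the URL pattern and module path are placeholders (Django 1.x style)
from django.conf.urls import url
from . import views

urlpatterns = [
    url(r'^channel/(?P<channel_id>\d+)/add/$', views.add, name='queue-add'),
]

# Example request:
#   GET /channel/42/add/?element=https://www.youtube.com/watch?v=tGiEsjtfJdg
# The response body is "1" when a video id was extracted and queued, "0" otherwise.
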
class Scheduler(object):
  """Redis-based scheduler"""

  def __init__(self, servers, persist, input_queue_key, input_queue_cls,
               input_queue_shard_dist, output_queue_key, output_queue_cls,
               output_queue_shard_dist, priority_queue_key,
               priority_queue_cls, priority_queue_shard_dist, recrawl_key,
               dupefilter_key, dupe_filter_ins, idle_before_close):
    """Initialize scheduler.

        Parameters
        ----------
        servers : list of Redis instance
        persist : bool
        queue_key : str
        queue_cls : queue class
        dupe_filter_cls : dupefilter class
        dupefilter_key : str
        idle_before_close : int
        """
    self.persist = persist
    self.input_queue_key = input_queue_key
    self.input_queue_cls = input_queue_cls
    self.input_queue_shard_dist = input_queue_shard_dist
    self.output_queue_key = output_queue_key
    self.output_queue_cls = output_queue_cls
    self.output_queue_shard_dist = output_queue_shard_dist
    self.priority_queue_key = priority_queue_key
    self.priority_queue_cls = priority_queue_cls
    self.priority_queue_shard_dist = priority_queue_shard_dist
    self.dupefilter_key = dupefilter_key
    self.df = dupe_filter_ins
    self.recrawl_key = recrawl_key
    self.idle_before_close = idle_before_close
    self.stats = None
    self.servers = servers
    self.queues = QueueManager()
    self.url_normalize = UrlNormalize.get_instance()

  def __len__(self):
    return self.queues.len(PRIORITY_QUEUE_TAG) + \
           self.queues.len(OUTPUT_QUEUE_TAG)

  @classmethod
  def from_settings(cls, settings):
    persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    input_queue_key = settings.get(
      'INPUT_QUEUE_KEY', INPUT_QUEUE_KEY)
    input_queue_cls = load_object(settings.get(
      'INPUT_QUEUE_CLASS', INPUT_QUEUE_CLASS))
    input_queue_shard_dist = settings.get(
      'INPUT_QUEUE_SHARD_DIST', INPUT_QUEUE_SHARD_DIST)
    output_queue_key = settings.get(
      'OUTPUT_QUEUE_KEY', OUTPUT_QUEUE_KEY)
    output_queue_cls = load_object(settings.get(
      'OUTPUT_QUEUE_CLASS', OUTPUT_QUEUE_CLASS))
    output_queue_shard_dist = settings.get(
      'OUTPUT_QUEUE_SHARD_DIST', OUTPUT_QUEUE_SHARD_DIST)
    priority_queue_key = settings.get(
      'PRIORITY_QUEUE_KEY', PRIORITY_QUEUE_KEY)
    priority_queue_cls = load_object(settings.get(
      'PRIORITY_QUEUE_CLASS', PRIORITY_QUEUE_CLASS))
    priority_queue_shard_dist = settings.get(
      'PRIORITY_QUEUE_SHARD_DIST', PRIORITY_QUEUE_SHARD_DIST)
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
    servers = connection.from_settings(settings)
    dupefilter_ins = load_object(
      settings['DUPEFILTER_CLASS']).from_settings(settings)
    recrawl_key = settings.get('RECRAWL_LIST_KEY', RECRAWL_KEY)
    return cls(servers, persist, input_queue_key, input_queue_cls,
               input_queue_shard_dist, output_queue_key, output_queue_cls,
               output_queue_shard_dist, priority_queue_key,
               priority_queue_cls, priority_queue_shard_dist, recrawl_key,
               dupefilter_key, dupefilter_ins, idle_before_close)

  @classmethod
  def from_crawler(cls, crawler):
    instance = cls.from_settings(crawler.settings)
    # FIXME: for now, stats are only supported from this constructor
    instance.stats = crawler.stats
    return instance

  def open(self, spider):
    self.spider = spider
    input_queue = CachedRequestQueueCluster(
      self.servers,
      self.input_queue_key,
      self.input_queue_cls,
      self.input_queue_shard_dist,
      self.spider)
    output_queue = CachedRequestQueueCluster(
      self.servers,
      self.output_queue_key,
      self.output_queue_cls,
      self.output_queue_shard_dist,
      self.spider)
    priority_queue = RequestQueueCluster(
      self.servers,
      self.priority_queue_key,
      self.priority_queue_cls,
      self.priority_queue_shard_dist,
      self.spider)
    self.queues.add(INPUT_QUEUE_TAG, input_queue)
    self.queues.add(OUTPUT_QUEUE_TAG, output_queue)
    self.queues.add(PRIORITY_QUEUE_TAG, priority_queue)
    if self.idle_before_close < 0:
      self.idle_before_close = 0
    # Log a notice if requests are already queued so the crawl resumes from them.
    if len(input_queue):
      spider.log("Resuming crawl (%d requests scheduled)" % len(input_queue))
    if isinstance(self.df, RFPDupeFilter):
      self.df.set_spider(spider)

  def close(self, reason):
    if not self.persist:
      self.df.clear()
      self.queues.clear(INPUT_QUEUE_TAG)
      self.queues.clear(OUTPUT_QUEUE_TAG)
      self.queues.clear(PRIORITY_QUEUE_TAG)

  def enqueue_request(self, request):
    if not request:
      return
    # TODO(Xiaohe): move URL normalization to a better place; some code paths
    # already normalize the URL while processing the request or response.
    tmpurl = self.url_normalize.get_unique_url(request.url)
    if not tmpurl:
      raise Exception('Bad request url: %s' % request.url)
    # Keep the raw URL in request meta and schedule the normalized request.
    new_meta = request.meta.copy() or {}
    new_meta['Rawurl'] = request.url
    nrequest = request.replace(url=tmpurl, meta=new_meta)
    if not request.dont_filter and self.df.request_seen(request):
      return
    if self.stats:
      self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
    self.queues.push(INPUT_QUEUE_TAG, nrequest)

  def next_request(self):
    block_pop_timeout = self.idle_before_close
    request = self.queues.pop(PRIORITY_QUEUE_TAG, block_pop_timeout)
    if request is None:
      request = self.queues.pop(OUTPUT_QUEUE_TAG, block_pop_timeout)
    if request and self.stats:
      self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
    if request and 'Rawurl' not in request.meta:
      tmpurl = self.url_normalize.get_unique_url(request.url)
      if not tmpurl:
        raise Exception('Bad request url: %s' % request.url)
      nrequest = request.replace(url=tmpurl)
      return nrequest
    return request


  def has_pending_requests(self):
    return len(self) > 0
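
Configuration sketch: the Scheduler is built entirely from Scrapy settings via from_settings()/from_crawler(). The setting names below are the ones read in from_settings(); the module paths, keys and values are placeholders, and the project ships its own defaults for anything omitted.

# settings.py -- placeholder paths and values
SCHEDULER = 'myproject.scheduler.Scheduler'              # assumed module path of this class
SCHEDULER_PERSIST = True                                 # keep Redis queues and dupefilter between runs
SCHEDULER_IDLE_BEFORE_CLOSE = 10                         # seconds to block on pop before going idle
DUPEFILTER_CLASS = 'myproject.dupefilter.RFPDupeFilter'  # assumed dupefilter implementation
INPUT_QUEUE_KEY = 'crawler:input'
OUTPUT_QUEUE_KEY = 'crawler:output'
PRIORITY_QUEUE_KEY = 'crawler:priority'
RECRAWL_LIST_KEY = 'crawler:recrawl'
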
class TestSequenceFunctions(TestCase):
    def setUp(self):
        user = User.objects.create(username='******')
        user.save()

        channel = Channel(creator=user)
        channel.save()

        self.queue = QueueManager(channel=channel.id)

    def test_db_connection(self):
        print "Test 1"
        self.assertIsNotNone(self.queue.get_db())

    def test_check_initial_emptiness(self):
        print "Test 2"
        self.assertEqual(len(self.queue.getQueue()), 0)

    def test_add_item(self):
        print "Test 2"

        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.1")
        self.assertEqual(len(self.queue.getQueue()), 1)

    def test_rm_item(self):
        print "Test 3"

        self.queue.rm(url="tGiEsjtfJdg")
        self.assertEqual(len(self.queue.getQueue()), 0)

    def test_uniqueness(self):
        print "Test 4"

        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.1")
        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.1")
        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.2")
        self.assertEqual(len(self.queue.getQueue()), 1)

    def test_next_video(self):
        print "Test 5"

        self.queue.add(url="tGiEsjtfJdg", creator="127.0.0.1")
        self.queue.add(url="XFwVfrAURDg", creator="127.0.0.1")
        self.queue.add(url="EfuVcRdamCY", creator="127.0.0.1")
        self.queue.add(url="4pRPAbCwgSs", creator="127.0.0.1")

        self.assertEqual(len(self.queue.getQueue()), 4)
        self.assertEqual(self.queue.next(), "tGiEsjtfJdg")
        self.assertEqual(len(self.queue.getQueue()), 3)
        self.assertEqual(self.queue.next(), "XFwVfrAURDg")
        self.assertEqual(len(self.queue.getQueue()), 2)
        self.assertEqual(self.queue.next(), "EfuVcRdamCY")
        self.assertEqual(len(self.queue.getQueue()), 1)
        self.assertEqual(self.queue.next(), "4pRPAbCwgSs")
        self.assertEqual(len(self.queue.getQueue()), 0)
        self.assertIsNone(self.queue.next())

    def test_votes(self):
        print "Test 6"

        # Assert that a vote cannot be registered for a URL that is not in the queue
        self.assertFalse(
            self.queue.register_vote(url="dummy",
                                     positive=1,
                                     negative=0,
                                     creator="127.0.0.1"))

        # Assert vote counts for a single-item queue
        self.assertEqual(self.queue.add(url="tGiEsjtfJdg",
                                        creator="127.0.0.1"),
                         (True, True))   # votes now: 1 positive, 0 negative
        self.assertEqual(self.queue.add(url="tGiEsjtfJdg",
                                        creator="127.0.0.1"),
                         (True, False))  # still 1 positive, 0 negative
        elements = self.queue.getQueue()
        for element in elements:
            self.assertIsNotNone(element)
            self.assertEqual(element.get("positive"), 1)
            self.assertEqual(element.get("negative"), 0)

        self.assertTrue(
            self.queue.register_vote(
                url="tGiEsjtfJdg",  #2,0
                positive=1,
                negative=0,
                creator="127.0.0.2"))
        self.assertFalse(
            self.queue.register_vote(
                url="tGiEsjtfJdg",  #2,0
                positive=1,
                negative=0,
                creator="127.0.0.2"))

        elements = self.queue.getQueue()
        self.assertEqual(len(elements), 1)

        element = [x for x in elements if x.get("url") == "tGiEsjtfJdg"][0]
        self.assertIsNotNone(element)
        self.assertEqual(element.get("positive"), 2)
        self.assertEqual(element.get("negative"), 0)

        # Assert vote counts for a larger queue
        self.queue.add(url="XFwVfrAURDg", creator="127.0.0.1")
        self.queue.add(url="EfuVcRdamCY", creator="127.0.0.1")
        self.queue.add(url="4pRPAbCwgSs", creator="127.0.0.1")

        self.assertIsNotNone(
            self.queue.register_vote(url="tGiEsjtfJdg",
                                     positive=0,
                                     negative=1,
                                     creator="127.0.0.2"))
        self.assertIsNotNone(
            self.queue.register_vote(url="XFwVfrAURDg",
                                     positive=1,
                                     negative=0,
                                     creator="127.0.0.2"))
        self.assertIsNotNone(
            self.queue.register_vote(url="EfuVcRdamCY",
                                     positive=1,
                                     negative=0,
                                     creator="127.0.0.2"))
        self.assertIsNotNone(
            self.queue.register_vote(url="4pRPAbCwgSs",
                                     positive=1,
                                     negative=0,
                                     creator="127.0.0.2"))

        elements = self.queue.getQueue()
        self.assertEqual(len(elements), 4)

        for element in elements:
            if element.get("url") == "tGiEsjtfJdg":
                self.assertEqual(element.get("positive"), 1)
                self.assertEqual(element.get("negative"), 1)
            elif element.get("url") == "XFwVfrAURDg":
                self.assertEqual(element.get("positive"), 2)
                self.assertEqual(element.get("negative"), 0)
            elif element.get("url") == "4pRPAbCwgSs":
                self.assertEqual(element.get("positive"), 2)
                self.assertEqual(element.get("negative"), 0)