def parse(self, response):
    log.msg('parse login callback response', level=log.DEBUG)
    if response.body.find('feedBackUrlCallBack') != -1:
      data = json.loads(re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I).group(1))
      userinfo = data.get('userinfo', '')
      if len(userinfo):
        log.msg('user id %s' % userinfo['userid'], level=log.INFO)
        assert userinfo['userid'] == self.username
        self.logined = True

        bootstrap = settings.get('BOOTSTRAP')
        log.msg('bootstrap from %s' % bootstrap, level=log.INFO)
        # FIXME: use last scheduled time instead of today, otherwise queue filter will not work
        today = datetime.now()
        if bootstrap == 'file':
          lines = tuple(codecs.open('items.txt', 'r', 'utf-8'))
          for line in lines:
            line = line.strip()
            if not line or line.startswith("#"):
              continue
            start = _epoch()
            url = QueryFactory.create_timerange_query(urllib.quote(line.encode('utf8')), start, today)
            request = Request(url=url, callback=self.parse_weibo, meta={
              'query': line,
              'start': start.strftime("%Y-%m-%d %H:%M:%S"),
              'end': today.strftime("%Y-%m-%d %H:%M:%S"),
              'last_fetched': today.strftime("%Y-%m-%d %H:%M:%S")})
            yield request
      else:
        self.log('login failed: errno=%s, reason=%s' % (data.get('errno', ''), data.get('reason', '')))
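
# For reference, a minimal sketch of the JSONP-style login callback that
# parse() above expects. The wrapper name and the userinfo/errno/reason
# fields are inferred from how the code reads them, not from any official
# Weibo documentation, so treat this as an illustrative assumption.
def _demo_parse_login_callback():
  import json
  import re
  sample_body = ('feedBackUrlCallBack({"userinfo": {"userid": "someuser"}, '
                 '"errno": "0", "reason": ""})')
  data = json.loads(re.search(r'feedBackUrlCallBack\((.*?)\)', sample_body, re.I).group(1))
  return data.get('userinfo', {}).get('userid')  # -> 'someuser'
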
  def parse_weibo(self, response):
    query = response.request.meta['query']
    start = datetime.strptime(response.request.meta['start'], "%Y-%m-%d %H:%M:%S")
    end = datetime.strptime(response.request.meta['end'], "%Y-%m-%d %H:%M:%S")
    delta = daterange(start, end).delta()
    last_fetched = datetime.strptime(response.request.meta['last_fetched'], "%Y-%m-%d %H:%M:%S")

    jQuery = pq(response.body)
    scripts = jQuery('script')

    text = "".join(filter(lambda x: x is not None, [x.text for x in scripts]))
    # check if we exceed the sina limit
    sassfilter_match = re.search(r'{(\"pid\":\"pl_common_sassfilter\".*?)}', text, re.M | re.I)
    if sassfilter_match:
      raise CloseSpider('weibo search exceeded')

    # check the num of search results
    totalshow_match = re.search(r'{(\"pid\":\"pl_common_totalshow\".*?)}', text, re.M | re.I)
    if totalshow_match:
      html = json.loads(totalshow_match.group())['html']
      if len(html) == 0:
        raise CloseSpider('not login? %s' % html)
      totalshow = pq(html)
      if totalshow('div.topcon_l').html() is None:
        log.msg('%s 0 feeds' % query, level=log.INFO)
        return
      topcon_num = int(re.search(r'\s(\d+)\s', totalshow('div.topcon_l').text().replace(',', ''), re.I).group(1))
      log.msg('%s %d feeds' % (query, topcon_num), level=log.INFO)
      max_feeds = settings.getint('FEED_LIMIT', 200000)
      if topcon_num > max_feeds:
        log.msg('too many (%d) results for %s.' % (topcon_num, query), level=log.WARNING)
      elif 1000 < topcon_num < max_feeds:
        # weibo search only allow 20 feeds on 1 page and at most 50 pages.
        days = delta.days / 2.0
        middle = start + timedelta(days)

        # first part
        url = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), start, middle)
        request = Request(url=url, callback=self.parse_weibo)
        request.meta['query'] = query
        request.meta['start'] = start.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['end'] = middle.strftime("%Y-%m-%d %H:%M:%S")
        request.meta['priority'] = days / 2
        request.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
        yield request

        # second part
        url2 = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), middle, end)
        request2 = Request(url=url2, callback=self.parse_weibo)
        request2.meta['query'] = query
        request2.meta['start'] = middle.strftime("%Y-%m-%d %H:%M:%S")
        request2.meta['end'] = end.strftime("%Y-%m-%d %H:%M:%S")
        request2.meta['priority'] = days / 2
        request2.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
        yield request2
      else:
        # check the feeds update
        feedlist_match = re.search(r'{(\"pid\":\"pl_weibo_feedlist\".*?)}', text, re.M | re.I)
        if feedlist_match:
          search_results = pq(json.loads(feedlist_match.group())['html'])
          feeds = search_results('dl.feed_list')
          search_pages = search_results('ul.search_page_M')
          pages = SearchPage.wrap(search_pages)

          # send the items to pipeline
          for feed in feeds:
            item = ScrapyWeiboItem()
            item['html'] = tostring(feed)
            yield item
          # skip the first page and request the remaining result pages
          for i in xrange(2, len(pages)):
            page_url = pages[i]
            log.msg('requesting result page %s' % page_url)
            request = Request(url=page_url, callback=self.parse_page)
            request.meta['query'] = page_url
            yield request
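
# The bisection above relies on Weibo search showing roughly 20 feeds per
# page for at most 50 pages (about 1,000 feeds per query), so an over-full
# time range is halved and each half re-queried until every window fits.
# A self-contained sketch of that idea; count_results is a hypothetical
# stand-in for a live search and is not part of the spider above.
def _demo_split_ranges(start, end, count_results, page_limit=20 * 50):
  """Yield (start, end) windows whose result count fits under page_limit."""
  from datetime import timedelta  # local import keeps the sketch self-contained
  if count_results(start, end) <= page_limit or end - start <= timedelta(days=1):
    yield start, end
    return
  middle = start + (end - start) / 2
  for window in _demo_split_ranges(start, middle, count_results, page_limit):
    yield window
  for window in _demo_split_ranges(middle, end, count_results, page_limit):
    yield window

# usage sketch: a fake counter whose result count grows with window length
#   for s, e in _demo_split_ranges(datetime(2013, 1, 1), datetime(2013, 2, 1),
#                                  lambda s, e: (e - s).days * 300):
#     print(s, e)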