Example #1
 def run(self, params):
     url = params['url']
     flag = params['flag']
     print(url)
     try:
         ps = crawl(url)
         if len(ps) > 2000:
             uid = store2pg(ps=ps, url=url, flag=flag)
         else:
             uid = None
         if uid:
             urls = re.compile(
                 '<a target="_blank" href="(http://www.feelcars.com/.*?html)">',
                 re.S).findall(ps)
             for u in urls:
                 url = u
                 log.info('入队列 jz_qckj_pagesource')
                 queue_job('main_qichetansuowang.Crawler2', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qckj_pagesource')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qckj_pagesource')
         queue_job('main_qichetansuowang.Crawler1', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qckj_pagesource')
Example #2
 def run(self, params):
     log.info("Getting context info...")
     return {
         "job_id": get_current_job().id,
         "worker_id": get_current_worker().id,
         "config": get_current_config()
     }
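
The task above relies on MRQ's job context helpers. For reference, a minimal self-contained sketch of such a task module is shown below; the import paths follow MRQ's usual layout (Task base class in mrq.task, helpers in mrq.context) but are stated here as an assumption, not taken from this listing.

# Hedged sketch: a minimal MRQ task module. Import paths assume the usual MRQ
# layout (Task in mrq.task, context helpers in mrq.context).
from mrq.task import Task
from mrq.context import log, get_current_job, get_current_worker, get_current_config


class ContextInfo(Task):
    def run(self, params):
        # Report identifiers of the job and worker currently executing this task.
        log.info("Getting context info...")
        return {
            "job_id": get_current_job().id,
            "worker_id": get_current_worker().id,
            "config": get_current_config(),
        }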
Example #3
 def run(self, params):
     url = params['url']
     flag = params['flag']
     print(url)
     try:
         ps = crawl(url)
         if len(str(ps)) > 500:
             uid = store2pg(ps=ps, url=url, flag=flag)
         else:
             uid = None
         if uid:
             urls = list(
                 re.compile(
                     '<p class="title">.*?<a href="(http:.*?)" target="_blank">',
                     re.S).findall(ps))
             for u in urls:
                 url1 = u
                 log.info('入队列 jz_cj_pagesource')
                 queue_job('main_dongfangcaifu.Crawler2', {
                     'url': url1,
                     'flag': flag
                 },
                           queue='jz_cj_pagesource')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler1', {
             'url': url,
             'flag': flag
         },
                   queue='jz_cj_pagesource')
Example #4
 def run(self, params):
     url = params['url']
     flag = params['flag']
     print(url)
     try:
         ps = crawl(url)
         print(len(ps))
         if len(ps) > 100:
             uid = store2pg(ps=ps, url=url, flag=flag)
         else:
             uid = None
         if uid:
             urls = re.findall('<a class="newsLink" href="(.*?)">', ps)
             for u in urls:
                 url = 'https://www.lynkco.com.cn' + u
                 log.info('入队列 jz_qymh_pagesource')
                 queue_job('main_lingkeqiche.Crawler2', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qymh_pagesource')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qymh_pagesource')
         queue_job('main_lingkeqiche.Crawler1', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qymh_pagesource')
Example #5
    def run(self, params):

        redis_key_started = Queue.redis_key_started()

        stats = {"fetched": 0, "requeued": 0}

        # Fetch all the jobs started more than a minute ago - they should not
        # be in redis:started anymore
        job_ids = connections.redis.zrangebyscore(redis_key_started, "-inf", time.time() - params.get("timeout", 60))

        # TODO this should be wrapped inside Queue or Worker
        # we shouldn't access these internals here
        queue_obj = Queue("default")
        unserialized_job_ids = queue_obj.unserialize_job_ids(job_ids)

        for i, job_id in enumerate(job_ids):

            queue = Job(unserialized_job_ids[i], start=False, fetch=False).fetch(full_data=True).data["queue"]

            queue_obj = Queue(queue)

            stats["fetched"] += 1

            log.info("Requeueing %s on %s" % (unserialized_job_ids[i], queue))

            # TODO LUA script & don't rpush if not in zset anymore.
            with connections.redis.pipeline(transaction=True) as pipeline:
                pipeline.zrem(redis_key_started, job_id)
                pipeline.rpush(queue_obj.redis_key, job_id)
                pipeline.execute()

            stats["requeued"] += 1

        return stats
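
This is a maintenance task: it scans the redis:started sorted set for jobs whose worker stopped reporting and pushes them back onto their original queue. A hedged sketch of triggering such a task by hand follows; it reuses the queue_job helper seen throughout these examples, and the dotted task path is a placeholder, not a path from this project.

# Hedged sketch: manually enqueue a requeue-started-jobs style maintenance task.
# "maintenance.RequeueStartedJobs" and "default" are placeholder values;
# queue_job is the helper used throughout these examples.
queue_job(
    "maintenance.RequeueStartedJobs",
    {"timeout": 300},  # requeue jobs that have sat in redis:started for over 5 minutes
    queue="default",
)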
Example #6
def main():

  parser = argparse.ArgumentParser(description='Runs a task')

  cfg = config.get_config(parser=parser, config_type="run")
  cfg["is_cli"] = True
  set_current_config(cfg)
  log.info(cfg)
  if len(cfg["taskargs"]) == 1:
    params = json.loads(cfg["taskargs"][0])
  else:
    params = {}

    # mrq-run taskpath a 1 b 2 => {"a": "1", "b": "2"}
    for group in utils.group_iter(cfg["taskargs"], n=2):
      if len(group) != 2:
        print "Number of arguments wasn't even"
        sys.exit(1)
      params[group[0]] = group[1]

  if cfg["async"]:
    ret = queue.send_task(cfg["taskpath"], params, sync=False, queue=cfg["queue"])
    print(ret)
  else:
    worker_class = load_class_by_path(cfg["worker_class"])
    job = worker_class.job_class(None)
    job.data = {
      "path": cfg["taskpath"],
      "params": params,
      "queue": cfg["queue"]
    }
    job.datestarted = datetime.datetime.utcnow()
    set_current_job(job)
    ret = job.perform()
    print(json.dumps(ret))
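
The group_iter(cfg["taskargs"], n=2) loop above turns a flat list of command-line arguments into a params dict. A standalone sketch of that pairing step, in plain Python with no MRQ imports, is shown below.

# Hedged sketch of the argument-pairing step: ["a", "1", "b", "2"] -> {"a": "1", "b": "2"}
def pair_taskargs(taskargs):
    if len(taskargs) % 2 != 0:
        raise ValueError("Number of arguments wasn't even")
    # Zip the even-indexed items (keys) with the odd-indexed items (values).
    return dict(zip(taskargs[0::2], taskargs[1::2]))

assert pair_taskargs(["a", "1", "b", "2"]) == {"a": "1", "b": "2"}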
Example #7
File: io.py (Project: AshBT/mrq)
    def run(self, params):

        log.info("I/O starting")
        ret = self._run(params)
        log.info("I/O finished")

        return ret
Example #8
 def run(self, params):
     log.info("Getting context info...")
     return {
         "job_id": get_current_job().id,
         "worker_id": get_current_worker().id,
         "config": get_current_config()
     }
Example #9
  def run(self, params):

    self.collection = connections.mongodb_jobs.mrq_jobs

    redis_key_started = Queue.redis_key_started()

    stats = {
      "fetched": 0,
      "requeued": 0
    }

    # Fetch all the jobs started more than a minute ago - they should not be in redis:started anymore
    job_ids = connections.redis.zrangebyscore(redis_key_started, "-inf", time.time() - params.get("timeout", 60))

    for job_id in job_ids:

      queue = Job(job_id, start=False, fetch=False).fetch(full_data=True).data["queue"]

      stats["fetched"] += 1

      log.info("Requeueing %s on %s" % (job_id, queue))

      # TODO LUA script & don't rpush if not in zset anymore.
      with connections.redis.pipeline(transaction=True) as pipeline:
        pipeline.zrem(redis_key_started, job_id)
        pipeline.rpush(Queue(queue).redis_key, job_id)
        pipeline.execute()

      stats["requeued"] += 1

    return stats
Example #10
 def run(self, params):
     url = params['url']
     flag = params['flag']
     print(url)
     try:
         ps = crawl(url)
         print(len(ps))
         if len(ps) > 10:
             uid = store2pg(ps=ps, url=url, flag=flag)
         else:
             uid = None
         if uid:
             urls = re.findall('"supdata_whereid":"(.*?)"', ps)
             for u in urls:
                 url = 'http://www.changan.com.cn/news-details.shtml?whereid=%s&column_id=98' % u
                 log.info('入队列 jz_qymh_pagesource')
                 queue_job('main_changanqiche.Crawler2', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qymh_pagesource')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qymh_pagesource')
         queue_job('main_changanqiche.Crawler1', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qymh_pagesource')
Example #11
    def run(self, params):

        log.info("I/O starting")
        ret = self._run(params)
        log.info("I/O finished")

        return ret
Example #12
 def run(self, params):
     url = params['url']
     flag = params['flag']
     try:
         info = sess1.query(Jz_dongfangcaifu_content).filter_by(
             url=url, channel_name=flag).first()
         print(1)
         info_2 = sess.query(Jz_dongfangcaifu_PageSource).filter_by(
             url=url).first()
         print(2)
         if not info and info_2:
             ps = info_2.pagesource
             ps_uid = info_2.uid
             author = re.findall('data-source="(.*?)">', ps)
             author = author[0] if author else None
             public_time = re.findall('<div class="time">(.*?)</div>', ps)
             public_time = public_time[0].replace('年', '-').replace(
                 '月', '-').replace('日', '') if public_time else None
             content = re.compile('<!--文章主体-->(.*?)<!--原文标题-->',
                                  re.S).findall(ps)
             content2 = content[0] if content else None
             if content2:
                 pic = re.findall('<img src="(https.*?)"', content2)
                 pic = ';'.join(pic)
                 content2 = content2.replace(
                     '<img src',
                     '[img src').replace('" />',
                                         '" /]').replace('</p>', '\n')
             else:
                 pic = ''
             content = re.sub('<.*?>', '', content2.replace(
                 '&nbsp;', '')).replace('\t', '').replace(' ', '').replace(
                     '\u3000', '').replace('\n\n', '\n').replace(
                         '本文版权为电动汽车网-电动邦所有,欢迎转载但请务必注明来源。', '').strip()
             title = re.findall('<h1>(.*?)</h1>', ps)[0]
             #                tag = re.findall('<a class="fn-left".*?target="_blank">(.*?)</a>',ps)
             #                tag = ' '.join(tag)
             hid = store2pg_parse(url=url,
                                  author=author,
                                  public_time=public_time,
                                  page_source=ps_uid,
                                  content=content,
                                  website_name='东方财富网',
                                  channel_name=flag,
                                  title=title,
                                  topic=None,
                                  tag=None,
                                  meta_keywords=None,
                                  pic=pic,
                                  flag=None)
             if hid:
                 print('完成')
         else:
             print('新闻解析已存在')
     except Exception as e:
         print(e)
         if e != "'NoneType' object has no attribute 'replace'":
             print('重新入队')
             log.info('入队列 jz_cj_parse')
Example #13
    def run(self, params):

        # If there are more than this much items on the queue, we don't try to check if our mongodb
        # jobs are still queued.
        max_queue_items = params.get("max_queue_items", 1000)

        stats = {"fetched": 0, "requeued": 0}

        all_queues = Queue.all_known()

        for queue_name in all_queues:

            queue = Queue(queue_name)
            queue_size = queue.size()

            if queue.is_raw:
                continue

            log.info("Checking queue %s" % queue_name)

            if queue_size > max_queue_items:
                log.info("Stopping because queue %s has %s items" %
                         (queue_name, queue_size))
                continue

            queue_jobs_ids = set(queue.list_job_ids(limit=max_queue_items + 1))
            if len(queue_jobs_ids) >= max_queue_items:
                log.info(
                    "Stopping because queue %s actually had more than %s items"
                    % (queue_name, len(queue_jobs_ids)))
                continue

            for job_data in connections.mongodb_jobs.mrq_jobs.find(
                {
                    "queue": queue_name,
                    "status": "queued"
                },
                    projection={
                        "_id": 1
                    }).sort([["_id", 1]]):

                stats["fetched"] += 1

                if str(job_data["_id"]) in queue_jobs_ids:
                    log.info("Found job %s on queue %s. Stopping" %
                             (job_data["_id"], queue.id))
                    break

                # At this point, this job is not on the queue and we're sure
                # the queue is less than max_queue_items
                # We can safely requeue the job.
                log.info("Requeueing %s on %s" % (job_data["_id"], queue.id))

                stats["requeued"] += 1
                job = Job(job_data["_id"])
                job.requeue(queue=queue_name)

        return stats
Example #14
    def run(self, params):
        log.info("adding", params)
        res = params.get("a", 0) + params.get("b", 0)

        if params.get("sleep", 0):
            log.info("sleeping", params.get("sleep", 0))
            time.sleep(params.get("sleep", 0))

        return res
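
A task like this is normally enqueued with the queue_job helper used elsewhere in this listing. A hedged usage sketch follows; the "tasks.Add" path and the "default" queue name are placeholders.

# Hedged usage sketch: enqueue the Add task with two operands and a 1-second sleep.
# "tasks.Add" and "default" are placeholder values, not paths from this project.
queue_job("tasks.Add", {"a": 41, "b": 1, "sleep": 1}, queue="default")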
Example #15
 def run(self, params):
     log.info("pipeline..........")
     # {'processor': item.processor, 'request': item.request}
     pipeline = params.get('pipeline', None)
     result = params.get('result', None)
     if pipeline is not None:
         clazz = PIPEINE_MAP.get(pipeline)
         clazz().process_item(result)
         log.info('--------------------complete')
Example #16
    def run(self, params):
        log.info("adding", params)
        res = params.get("a", 0) + params.get("b", 0)

        if params.get("sleep", 0):
            log.info("sleeping %d", params.get("sleep", 0))
            time.sleep(params.get("sleep", 0))

        return res
Example #17
    def run(self, params):

        # If there are more than this much items on the queue, we don't try to check if our mongodb
        # jobs are still queued.
        max_queue_items = params.get("max_queue_items", 1000)

        stats = {
            "fetched": 0,
            "requeued": 0
        }

        all_queues = Queue.all_known()

        for queue_name in all_queues:

            queue = Queue(queue_name)
            queue_size = queue.size()

            if queue.is_raw:
                continue

            log.info("Checking queue %s" % queue_name)

            if queue_size > max_queue_items:
                log.info("Stopping because queue %s has %s items" %
                         (queue_name, queue_size))
                continue

            queue_jobs_ids = set(queue.list_job_ids(limit=max_queue_items + 1))
            if len(queue_jobs_ids) >= max_queue_items:
                log.info(
                    "Stopping because queue %s actually had more than %s items" %
                    (queue_name, len(queue_jobs_ids)))
                continue

            for job_data in connections.mongodb_jobs.mrq_jobs.find({
                "queue": queue_name,
                "status": "queued"
            }, projection={"_id": 1}).sort([["_id", 1]]):

                stats["fetched"] += 1

                if str(job_data["_id"]) in queue_jobs_ids:
                    log.info("Found job %s on queue %s. Stopping" % (job_data["_id"], queue.id))
                    break

                # At this point, this job is not on the queue and we're sure
                # the queue is less than max_queue_items
                # We can safely requeue the job.
                log.info("Requeueing %s on %s" % (job_data["_id"], queue.id))

                stats["requeued"] += 1
                job = Job(job_data["_id"])
                job.requeue(queue=queue_name)

        return stats
Example #18
    def run(self, params):

        log.info("Will abort this task")

        connections.mongodb_jobs.tests_inserts.insert(params)
        try:
            raise InAbortException
        except InAbortException:
            abort_current_job()

        raise Exception("Should not be reached")
Example #19
    def run(self, params):

        log.info("Retrying in %s on %s" %
                 (params.get("delay"), params.get("queue")))

        connections.mongodb_jobs.tests_inserts.insert(params)

        retry_current_job(queue=params.get("queue"),
                          delay=params.get("delay"),
                          max_retries=params.get("max_retries"))

        raise Exception("Should not be reached")
Example #20
  def run(self, params):

    log.info("Retrying in %s on %s" % (params.get("countdown"), params.get("queue")))

    connections.mongodb_logs.tests_inserts.insert(params)

    if params.get("cancel_on_retry"):
      self.cancel_on_retry = params.get("cancel_on_retry")

    retry_current_job(queue=params.get("queue"), countdown=params.get("countdown"))

    raise Exception("Should not be reached")
Example #21
 def run(self, params):
     log.info("crawl..........%s")
     # {'processor': item.processor, 'request': item.request}
     processor = params.get('processor', None)
     request = params.get('request', None)
     if processor is not None:
         clazz = load_class('processors', processor)
         processor_instance = clazz()
         if request is not None:
             request = request_from_dict(request, processor_instance)
             # print(request)
             processor_instance.set_start_requests([request])
         SpiderCore(processor_instance, time_sleep=1).start()
         log.info('****************complete')
Example #22
    def run(self, params):
        key_name = params["key_name"]
        log.info("Opening file to extract exif for %s", key_name)

        # Use the exifread library to extract EXIF data from the image file
        f = open(key_name, "rb")  # exifread expects a binary file object
        exif_data = process_file(f, details=False)
        log.info("Extracted exif data")

        #Delete the file
        f.close()
        os.remove(key_name)

        # Only keep the fields we need from the library call before storing them in the database
        tags = {}
        for field_name in exif_data:
            field = exif_data[field_name]
            tags[field_name] = {
                'printable': str(field),
                'tag': field.tag,
                'field_type': field.field_type,
                'field_length': field.field_length,
                'values': str(field.values)
            }

        #Store dictionary of tags into mongodb instance
        log.info("Inserting tags into db")
        exif_store.insert_one(tags)
        log.info("Successfully inserted tags into db")
Example #23
    def run(self, params):

        log.info("Retrying in %s on %s" %
                 (params.get("delay"), params.get("queue")))

        connections.mongodb_jobs.tests_inserts.insert(params)

        retry_current_job(
            queue=params.get("queue"),
            delay=params.get("delay"),
            max_retries=params.get("max_retries")
        )

        raise Exception("Should not be reached")
Example #24
 def run(self, params):
     # On the initial run, a page depth of 30 is used
     # For the daily incremental run, a single page is enough
     url_s1 = ('http://www.feelcars.com/category/xinnengyuan/page/%s',
               '新能源')
     end = 4
     for i in range(1, end):
         url = url_s1[0] % str(i)
         log.info('入队列 jz_qckj_pagesource')
         queue_job('main_qichetansuowang.Crawler1', {
             'url': url,
             'flag': url_s1[-1]
         },
                   queue='jz_qckj_pagesource')
Example #25
 def run(self, params):
     url = params.get('ext', '').strip()
     topic = params.get('topic', '').strip()
     message = params.get('message', '').strip()
     ext_url = urlencode({'topic': topic, 'message': message})
     if '?' not in url:
         url = url + '?' + ext_url
     else:
         url = url + ext_url if url.endswith('&') else url + '&' + ext_url
     log.info('HTTP GET %s' % (url, ))
     res = requests.get(url)
     if res.ok:
         return res.content
     else:
         retry_current_job()
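
The query string handling above works, but appending parameters by string concatenation is easy to get wrong. A hedged sketch that merges the extra parameters with the standard library is shown below; it is independent of the surrounding task.

# Hedged sketch: merge extra query parameters into a URL with urllib.parse
# instead of manual '?' / '&' bookkeeping.
from urllib.parse import urlencode, urlsplit, urlunsplit, parse_qsl


def add_query_params(url, extra):
    parts = urlsplit(url)
    query = dict(parse_qsl(parts.query))
    query.update(extra)
    return urlunsplit(parts._replace(query=urlencode(query)))

# add_query_params("http://example.com/hook?x=1", {"topic": "t", "message": "m"})
# -> "http://example.com/hook?x=1&topic=t&message=m"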
Example #26
    def run(self, params):
        host = params.get('h', '').strip()
        port = params.get('p', 80)
        hkey = '%s:%d' % (host, port)
        if _check_t(params):
            abort_current_job()

        connections.redis.sadd(CONF_DATA_ALL_KEY, hkey)
        tmp = connections.redis.hget(CONF_DATA_RANK_KEY, hkey)
        now_num = int(tmp) if tmp else 0
        if not now_num:
            connections.redis.hincrby(CONF_DATA_RANK_KEY, hkey, 1)
        test = CONF_CHECK_PROXY_FUNC(host, port)
        test and log.info('CHECK OK proxy:%s, num:%d' % (hkey, now_num))
        if test:
            if now_num <= 0:
                now_num = 1 if connections.redis.sismember(
                    CONF_DATA_OK_KEY, hkey) else 10
                connections.redis.hset(CONF_DATA_RANK_KEY, hkey, now_num)
            elif 0 < now_num < 20:
                connections.redis.hincrby(CONF_DATA_RANK_KEY, hkey, 1)
                now_num += 1
        else:
            if now_num >= -10:
                connections.redis.hincrby(CONF_DATA_RANK_KEY, hkey, -1)
            now_num -= 1

        if now_num > 0:
            connections.redis.sadd(CONF_DATA_OK_KEY, hkey)
        else:
            connections.redis.srem(CONF_DATA_OK_KEY, hkey)
            now_num <= -10 and connections.redis.srem(CONF_DATA_ALL_KEY, hkey)
        return {'proxy': hkey, 'num': now_num, 'test': test}
Example #27
 def run(self, params):
     params1 = (
         'https://www.lynkco.com.cn/Brand/News/NewsMore?pageIndex=%s',
         '新闻潮讯')
     # On the initial run, a page depth of 30 is used
     for u in [params1]:
         if u[-1] == '新闻潮讯':
             end = 8
         for i in range(1, end):
             url = u[0] % str(i)
             log.info('入队列 jz_qymh_pagesource')
             queue_job('main_lingkeqiche.Crawler1', {
                 'url': url,
                 'flag': u[-1]
             },
                       queue='jz_qymh_pagesource')
Example #28
    def run(self, params):

        key_name = params["key_name"]
        _, extension = os.path.splitext(key_name)
        if (extension == ".jpg"):
            conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
            bucket = conn.get_bucket(params["bucket_name"])
            key = bucket.get_key(key_name)
            key.get_contents_to_filename(key_name)
            log.info("Succesfully downloaded file from s3 bucket %s", key_name)
            queue_job("tasks.Write", {"key_name": key_name}, queue=write_queue)
        else:
            #TODO handle compressed and other file types
            log.warn(
                "Currently unable to handle file extension type for file %s",
                key_name)
            os.remove(key_name)
Example #29
    def run(self, params):
        params1 = (
            'https://www.changan.com.cn/news-changan?page=%s&year=%s&keyword=&type=0&ajax_req=1&t=1584689024944',
            '长安动态')
        params2 = ('http://www.changan.com.cn/company.shtml', '合资合作')

        # On the initial run, a page depth of 30 is used
        for u in [params1]:
            if u[-1] == '长安动态':
                year = datetime.datetime.now().strftime('%Y')
                # Runs once a week; one page (8 articles) per run
                for page in range(1, 2):
                    url = u[0] % (str(page), str(year))
                    log.info('入队列 jz_qymh_pagesource')
                    queue_job('main_changanqiche.Crawler1', {
                        'url': url,
                        'flag': u[-1]
                    },
                              queue='jz_qymh_pagesource')
Example #30
def crawl(url):
    '''Fetch the raw page source (pageSource) for a URL'''
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Host': 'www.caam.org.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
        # 'Cookie': '__xwaf_id=e52bf9be294d90397354ab6d12a689eaa6fbbfae12e1338e0bac7e8b2b179e78; __xwaf_browser_auth=BkPxoimjrj8Xi8GRntg8Lw==; __xwaf_filter_key=57fdc8dd987c1627'
    }
    session = requests.session()
    try:
        ipusing = get_proxy_redis()
        ipusing = str(ipusing, encoding='utf-8')
        #        _proxy = {'http':'http://%s'%ipusing,'https':'https://%s'%ipusing}
        log.info('now using %s' % ipusing)
        data = session.get(url, headers=header, timeout=30)
        print("%s's status_code is %s" %
              (url, data.status_code))  # print the status code for this url
        if data.status_code == 200:
            data.encoding = data.apparent_encoding
            pageSource = data.text
            data.close()
            return pageSource
        elif data.status_code == 404:
            return '404'
    except Exception:
        pass
    finally:
        time.sleep(random.uniform(0, 2))
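
crawl() returns the decoded page source on HTTP 200, the string '404' on a 404, and None on any other outcome (the bare except swallows errors). A short hedged usage sketch showing how callers in the other examples gate on that return value:

# Hedged usage sketch: callers check the returned page source before storing it,
# since crawl() returns None on failure and '404' for missing pages.
ps = crawl('http://www.feelcars.com/category/xinnengyuan/page/1')
if ps and ps != '404' and len(ps) > 2000:
    print('fetched %d characters of page source' % len(ps))
else:
    print('fetch failed or page too short; would be requeued')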
Example #31
 def run(self, params):
     # On the initial run, a page depth of 30 is used
     # For the daily incremental run, a single page is enough
     url_s1 = ('http://finance.eastmoney.com/news/cjjsp_%s.html', '经济时评')
     url_s2 = ('http://finance.eastmoney.com/news/cgnjj_%s.html', '国内经济')
     url_s3 = ('http://finance.eastmoney.com/news/cgjjj_%s.html', '国际经济')
     #        for i in range(1,4):
     for i in range(1, 26):
         url1 = url_s1[0] % str(i)
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler1', {
             'url': url1,
             'flag': url_s1[-1]
         },
                   queue='jz_cj_pagesource')
         url2 = url_s2[0] % str(i)
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler1', {
             'url': url2,
             'flag': url_s2[-1]
         },
                   queue='jz_cj_pagesource')
         url3 = url_s3[0] % str(i)
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler1', {
             'url': url3,
             'flag': url_s3[-1]
         },
                   queue='jz_cj_pagesource')
Example #32
    def run(self, params):

        collection = connections.mongodb_jobs.mrq_jobs

        # If there are more than this much items on the queue, we don't try to check if our mongodb
        # jobs are still queued.
        max_queue_items = params.get("max_queue_items", 1000)

        stats = {"fetched": 0, "requeued": 0}

        for job_data in collection.find({
                "status": "queued"
        },
                                        fields={
                                            "_id": 1,
                                            "queue": 1
                                        }).sort([("_id", 1)]):

            stats["fetched"] += 1

            queue = Queue(job_data["queue"])
            queue_size = queue.size()
            if queue_size > max_queue_items:
                log.info("Stopping because queue %s has %s items" %
                         (queue, queue_size))
                break

            queue_jobs_ids = set(queue.list_job_ids(limit=max_queue_items + 1))
            if len(queue_jobs_ids) >= max_queue_items:
                log.info(
                    "Stopping because queue %s actually had more than %s items"
                    % (queue, len(queue_jobs_ids)))
                break

            if str(job_data["_id"]) in queue_jobs_ids:
                log.info("Stopping because we found job %s in redis" %
                         job_data["_id"])
                break

            # At this point, this job is not on the queue and we're sure
            # the queue is less than max_queue_items
            # We can safely requeue the job.
            log.info("Requeueing %s on %s" % (job_data["_id"], queue.id))

            stats["requeued"] += 1
            job = Job(job_data["_id"])
            job.requeue(queue=job_data["queue"])

        return stats
Example #33
    def run(self, params):

        redis_key_started = Queue.redis_key_started()

        stats = {
            "fetched": 0,
            "requeued": 0
        }

        # Fetch all the jobs started more than a minute ago - they should not
        # be in redis:started anymore
        job_ids = connections.redis.zrangebyscore(
            redis_key_started, "-inf", time.time() - params.get("timeout", 60))

        # TODO this should be wrapped inside Queue or Worker
        # we shouldn't access these internals here
        queue_obj = Queue("default")
        unserialized_job_ids = queue_obj.unserialize_job_ids(job_ids)

        for i, job_id in enumerate(job_ids):

            queue = Job(unserialized_job_ids[i], start=False, fetch=False).fetch(
                full_data=True).data["queue"]

            queue_obj = Queue(queue)

            stats["fetched"] += 1

            log.info("Requeueing %s on %s" % (unserialized_job_ids[i], queue))

            # TODO LUA script & don't rpush if not in zset anymore.
            with connections.redis.pipeline(transaction=True) as pipeline:
                pipeline.zrem(redis_key_started, job_id)
                pipeline.rpush(queue_obj.redis_key, job_id)
                pipeline.execute()

            stats["requeued"] += 1

        return stats
Example #34
def crawl(url):
    '''Fetch the raw page source (pageSource) for a URL'''
    header = get_header()
    session = requests.session()
    try:
        ipusing = get_proxy_redis()
        ipusing = str(ipusing, encoding='utf-8')
        #        _proxy = {'http':'http://%s'%ipusing,'https':'https://%s'%ipusing}
        log.info('now using %s' % ipusing)
        data = session.get(url, headers=header, timeout=30)
        print("%s's status_code is %s" %
              (url, data.status_code))  # print the status code for this url
        if data.status_code == 200:
            data.encoding = data.apparent_encoding
            pageSource = data.text
            data.close()
            return pageSource
        elif data.status_code == 404:
            return 404
    except Exception:
        pass
    finally:
        time.sleep(random.uniform(0, 2))
Example #35
 def run(self, params):
     url = params['url']
     flag = params['flag']
     try:
         info = sess.query(Jz_qichetansuowang_PageSource).filter_by(
             url=url).first()
         sess.rollback()
         if not info:
             ps = crawl(url)
             if len(str(ps)) > 2000:
                 uid = store2pg(ps=ps, url=url, flag=flag)
             else:
                 uid = None
             if uid:
                 log.info('入队列 jz_qckj_parse')
                 queue_job('main_qichetansuowang.Parse', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qckj_parse')
         else:
             print('新闻已存在,并入解析')
             queue_job('main_qichetansuowang.Parse', {
                 'url': url,
                 'flag': flag
             },
                       queue='jz_qckj_parse')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qckj_pagesource')
         queue_job('main_qichetansuowang.Crawler2', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qckj_pagesource')
Example #36
    def run(self, params):

        # Some systems may be configured like this.
        if not PY3 and params.get("utf8_sys_stdout"):
            import codecs
            import sys
            UTF8Writer = codecs.getwriter('utf8')
            sys.stdout = UTF8Writer(sys.stdout)
        if params["class_name"] == "unicode":
            log.info(u"caf\xe9")
        elif params["class_name"] == "string":
            log.info("cafe")
        elif params["class_name"] == "latin-1":
            log.info("caf\xe9")
        elif params["class_name"] == "bytes1":
            log.info("Mat\xc3\xa9riels d'entra\xc3\xaenement")

        return True
Example #37
    def run(self, params):

        # Some systems may be configured like this.
        if params.get("utf8_sys_stdout"):
            import codecs
            import sys
            UTF8Writer = codecs.getwriter('utf8')
            sys.stdout = UTF8Writer(sys.stdout)

        if params["class_name"] == "unicode":
            log.info(u"caf\xe9")
        elif params["class_name"] == "string":
            log.info("cafe")
        elif params["class_name"] == "latin-1":
            log.info("caf\xe9")
        elif params["class_name"] == "bytes1":
            log.info("Mat\xc3\xa9riels d'entra\xc3\xaenement")

        return True
Example #38
  def run(self, params):

    self.collection = connections.mongodb_jobs.mrq_jobs

    # If there are more than this much items on the queue, we don't try to check if our mongodb
    # jobs are still queued.
    max_queue_items = params.get("max_queue_items", 1000)

    stats = {
      "fetched": 0,
      "requeued": 0
    }

    for job_data in self.collection.find({
      "status": "queued"
    }, fields={"_id": 1, "queue": 1}).sort([("_id", 1)]):

      stats["fetched"] += 1

      queue = Queue(job_data["queue"])
      queue_size = queue.size()
      if queue_size > max_queue_items:
        log.info("Stopping because queue %s has %s items" % (queue, queue_size))
        break

      queue_jobs_ids = set(queue.list_job_ids(limit=max_queue_items + 1))
      if len(queue_jobs_ids) >= max_queue_items:
        log.info("Stopping because queue %s actually had more than %s items" % (queue, len(queue_jobs_ids)))
        break

      if str(job_data["_id"]) in queue_jobs_ids:
        log.info("Stopping because we found job %s in redis" % job_data["_id"])
        break

      # At this point, this job is not on the queue and we're sure the queue is less than max_queue_items
      # We can safely requeue the job.
      log.info("Requeueing %s on %s" % (job_data["_id"], queue.id))

      stats["requeued"] += 1
      job = Job(job_data["_id"])
      job.requeue(queue=job_data["queue"])

    return stats
Example #39
    def run(self, params):
        filename = params.get('f', '').strip()
        if _check_t(params):
            abort_current_job()

        timer_num = 3
        timer_seq = CONF_CHECK_INTERVAL

        gql = pyfile.load_str(filename).strip()
        if not gql:
            abort_current_job()

        proxy_list, gret = run_gdom_page(
            gql,
            get_proxy=lambda: connections.redis.srandmember(CONF_DATA_OK_KEY))
        proxy_list and log.info('FETCH OK filename:%s, num:%d' %
                                (filename, len(proxy_list)))
        if gret.errors:
            log.error('FETCH ERROR filename:%s, errors:%s' %
                      (filename, gret.errors))

        if not proxy_list:
            log.error('FETCH EMPTY filename:%s, gret:%r' % (filename, gret))
            abort_current_job()

        timestamp = int(time.time())
        task_map = {}
        for proxy_str in proxy_list:
            host = proxy_str.split(':', 1)[0]
            port = int(proxy_str.split(':', 1)[1])
            for t_idx in range(timer_num):
                next_tick = timestamp + pyutils.crc32_mod(
                    proxy_str, timer_seq) + t_idx * timer_seq
                rawparam = '%s#%d#%d#%d' % (host, port, timer_seq,
                                            int(next_tick / timer_seq))
                task_map.setdefault(rawparam, next_tick)

        queue_raw_jobs('check_proxy_timed_set', task_map)
        return {
            'file': filename,
            'num': len(proxy_list),
            'proxy_list': proxy_list
        }
Example #40
 def run(self, params):
     url = params['url']
     flag = params['flag']
     try:
         info = sess.query(Jz_dongfangcaifu_PageSource).filter_by(
             url=url).first()
         sess.rollback()
         if not info:
             ps = crawl(url)
             if len(str(ps)) > 500 and '返回' not in str(ps):
                 uid = store2pg(ps=ps, url=url, flag=flag)
             else:
                 uid = None
             if uid:
                 log.info('入队列 jz_cj_parse')
                 queue_job('main_dongfangcaifu.Parse', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_cj_parse')
         else:
             print('新闻已存在')
             log.info('入队列 jz_cj_parse')
             queue_job('main_dongfangcaifu.Parse', {
                 'url': url,
                 'flag': flag
             },
                       queue='jz_cj_parse')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler2', {
             'url': url,
             'flag': flag
         },
                   queue='jz_cj_pagesource')
Example #41
 def run(self, params):
     url = params['url']
     flag = params['flag']
     try:
         info = sess.query(Jz_lingkeqiche_PageSource).filter_by(
             url=url).first()
         if not info:
             ps = crawl(url)
             if len(ps) > 100:
                 uid = store2pg(ps=ps, url=url, flag=flag)
             else:
                 uid = None
             if uid:
                 log.info('入队列 jz_qymh_parse')
                 queue_job('main_lingkeqiche.Parse', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qymh_parse')
         else:
             print('新闻已存在')
             log.info('入队列 jz_qymh_parse')
             queue_job('main_lingkeqiche.Parse', {
                 'url': url,
                 'flag': flag
             },
                       queue='jz_qymh_parse')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qymh_pagesource')
         queue_job('main_lingkeqiche.Crawler2', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qymh_pagesource')
Example #42
    def run(self, params):
        params = params['post_data']
        # params = params['data']
        log.info(params)
        # Check whether the same job already exists
        same_job = has_same_job(params)
        log.info('has same job ?', same_job)
        if same_job:
            if same_job['status'] == FINISH_STATUS and same_job[
                    'url'] != '' and same_job['total_price'] != '':
                # Make a copy of the existing job
                same_job['new_guid'] = copy_same_job(same_job, params)
                return {
                    'data': same_job,
                    'message': 'has the same job',
                    'status': 0
                }

        # Add a new job
        job_guid = insert_new_job(params)

        self.connect()
        from packing.models import Project, PackDetail

        # Check whether the same input has already been computed
        project = Project.objects.filter(data_input=params['data']).last()
        if project:
            log.info('has the same project data')
            total_price = 0
            all_products = project.products.all()
            if project.comment != params['comment']:
                # The comment differs, so save a new project record
                project.comment = params['project_comment']

                project.pk = None
                project.save()
                for product in all_products:
                    total_price += product.total_price
                    project.products.add(product)
            else:
                for product in all_products:
                    total_price += product.total_price

            url = '%s/product_detail/%d' % (HOST_URL, project.id)
            # Update the job status
            update_job_status(job_guid,
                              FINISH_STATUS,
                              url=url,
                              price=total_price)
            insert_job_result(job_guid, all_products)

            return {
                'data': {
                    'project_id': project.id,
                    'url': url,
                    'price': total_price
                },
                'message': 'the project had been done',
                'status': 0
            }

        res = shape_use(params)
        if res['is_error']:
            log.error(res['error_info'])

            update_job_status(job_guid, res['error_info'])
            return {'data': '', 'status': 10, 'message': res['error_info']}
        else:
            # Save the result
            # Optionally update the job's intermediate status:
            # update_job_status(job_guid, u'正在保存结果')
            log.info('saving the result into project')
            try:
                project, total_price = save_project(Project, PackDetail,
                                                    res['data'], params)
            except Exception as e:
                log.error(e)
                # Mark the job as failed
                update_job_status(job_guid, u'保存结果失败')
                return {
                    'data': res,
                    'message': 'error in save the result into project',
                    'status': 100
                }

            log.info('update job status and finish')
            # Finalize the job status
            url = '%s/product_detail/%d' % (HOST_URL, project.id)
            # Update the job status
            update_job_status(job_guid,
                              FINISH_STATUS,
                              url=url,
                              price=total_price)
            insert_job_result(job_guid, project.products.all())

            if project:
                res['new_project_id'] = project.id
                res['total_price'] = total_price

            return {'data': res, 'message': 'OK', 'status': 0}