Example #1
 def run(self, params):
     url = params['url']
     flag = params['flag']
     print(url)
     try:
         ps = crawl(url)
         print(len(ps))
         if len(ps) > 10:
             uid = store2pg(ps=ps, url=url, flag=flag)
         else:
             uid = None
         if uid:
             urls = re.findall('"supdata_whereid":"(.*?)"', ps)
             for u in urls:
                 url = 'http://www.changan.com.cn/news-details.shtml?whereid=%s&column_id=98' % u
                 log.info('入队列 jz_qymh_pagesource')
                 queue_job('main_changanqiche.Crawler2', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qymh_pagesource')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qymh_pagesource')
         queue_job('main_changanqiche.Crawler1', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qymh_pagesource')
Example #2
 def run(self, params):
     url = params['url']
     flag = params['flag']
     print(url)
     try:
         ps = crawl(url)
         if len(ps) > 2000:
             uid = store2pg(ps=ps, url=url, flag=flag)
         else:
             uid = None
         if uid:
             urls = re.compile(
                 '<a target="_blank" href="(http://www.feelcars.com/.*?html)">',
                 re.S).findall(ps)
             for u in urls:
                 url = u
                 log.info('入队列 jz_qckj_pagesource')
                 queue_job('main_qichetansuowang.Crawler2', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qckj_pagesource')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qckj_pagesource')
         queue_job('main_qichetansuowang.Crawler1', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qckj_pagesource')
Example #3
 def run(self, params):
     url = params['url']
     flag = params['flag']
     print(url)
     try:
         ps = crawl(url)
         if len(str(ps)) > 500:
             uid = store2pg(ps=ps, url=url, flag=flag)
         else:
             uid = None
         if uid:
             urls = list(
                 re.compile(
                     '<p class="title">.*?<a href="(http:.*?)" target="_blank">',
                     re.S).findall(ps))
             for u in urls:
                 url1 = u
                 log.info('入队列 jz_cj_pagesource')
                 queue_job('main_dongfangcaifu.Crawler2', {
                     'url': url1,
                     'flag': flag
                 },
                           queue='jz_cj_pagesource')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler1', {
             'url': url,
             'flag': flag
         },
                   queue='jz_cj_pagesource')
Example #4
 def run(self, params):
     url = params['url']
     flag = params['flag']
     print(url)
     try:
         ps = crawl(url)
         print(len(ps))
         if len(ps) > 100:
             uid = store2pg(ps=ps, url=url, flag=flag)
         else:
             uid = None
         if uid:
             urls = re.findall('<a class="newsLink" href="(.*?)">', ps)
             for u in urls:
                 url = 'https://www.lynkco.com.cn' + u
                 log.info('入队列 jz_qymh_pagesource')
                 queue_job('main_lingkeqiche.Crawler2', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qymh_pagesource')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qymh_pagesource')
         queue_job('main_lingkeqiche.Crawler1', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qymh_pagesource')
Example #5
 def requeue_job(self, params, fpath=None, nqueue=None):
     '''
     requeue an unfinished job
     Helper function for run_wrapped
     '''
     log.warning('Job Failed, re-queue...%s' % params['url'])
     queue_job(fpath, params, queue=nqueue)
Example #6
 def run(self, params):
     users = [{"user": x} for x in get_distinct_users()]
     print "Queue users"
     queue_jobs("bluebirdlib.tasks.GetUserTweets", users, queue="tweets")
     hts = [{"search": x} for x in HASHTAGS]
     print "Queue hashtags"
     queue_jobs("bluebirdlib.tasks.getHashTagTweets", hts, queue="tweets")
     print "Queue scheduler"
     queue_job("bluebirdlib.tasks.Scheduler", {}, queue="tweets")
     return 0
Example #7
def initialize_jobs(bucket_name):
    setup_context()
    jobs_count = 0
    conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    bucket = Bucket(connection=conn, name=bucket_name)
    for key in bucket.list():
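        # Queue one tasks.Download job per key in the bucket on the download queue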
        queue_job("tasks.Download", {
            "bucket_name": bucket_name,
            "key_name": key.key
        },
                  queue=download_queue)
        jobs_count += 1
    return jobs_count
Example #8
 def run(self, params):
     # On initialization, use a page length of 30
     # For the daily incremental run, one page is enough
     url_s1 = ('http://www.feelcars.com/category/xinnengyuan/page/%s',
               '新能源')
     end = 4
     for i in range(1, end):
         url = url_s1[0] % str(i)
         log.info('入队列 jz_qckj_pagesource')
         queue_job('main_qichetansuowang.Crawler1', {
             'url': url,
             'flag': url_s1[-1]
         },
                   queue='jz_qckj_pagesource')
Example #9
def main():

    parser = argparse.ArgumentParser(description='Runs a task')

    cfg = config.get_config(parser=parser, config_type="run", sources=("file", "env", "args"))
    cfg["is_cli"] = True
    set_current_config(cfg)

    if len(cfg["taskargs"]) == 1:
        params = json.loads(cfg["taskargs"][0])  # pylint: disable=no-member
    else:
        params = {}

        # mrq-run taskpath a 1 b 2 => {"a": "1", "b": "2"}
        for group in utils.group_iter(cfg["taskargs"], n=2):
            if len(group) != 2:
                print("Number of arguments wasn't even")
                sys.exit(1)
            params[group[0]] = group[1]

    if cfg["queue"]:
        ret = queue_job(cfg["taskpath"], params, queue=cfg["queue"])
        print(ret)
    else:
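        # No queue given: build the job locally, run it inline and print its result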
        worker_class = load_class_by_path(cfg["worker_class"])
        job = worker_class.job_class(None)
        job.set_data({
            "path": cfg["taskpath"],
            "params": params,
            "queue": cfg["queue"]
        })
        job.datestarted = datetime.datetime.utcnow()
        set_current_job(job)
        ret = job.perform()
        print(json_stdlib.dumps(ret, cls=MongoJSONEncoder))  # pylint: disable=no-member
Example #10
def api_job_action():
    params = {k: v for k, v in iteritems(request.form)}
    if params.get("status") and "-" in params.get("status"):
        params["status"] = params.get("status").split("-")
    return jsonify({"job_id": queue_job("mrq.basetasks.utils.JobAction",
                                        params,
                                        queue=get_current_config()["dashboard_queue"])})
Example #11
File: app.py Project: benjisg/mrq
def api_job_action():
    params = {k: v for k, v in request.form.iteritems()}
    if params.get("status") and "-" in params.get("status"):
        params["status"] = params.get("status").split("-")
    return jsonify({"job_id": queue_job("mrq.basetasks.utils.JobAction",
                                        params,
                                        queue=get_current_config()["dashboard_queue"])})
Example #12
 def run(self, params):
     params1 = (
         'https://www.lynkco.com.cn/Brand/News/NewsMore?pageIndex=%s',
         '新闻潮讯')
     # On initialization, use a page length of 30
     for u in [params1]:
         if u[-1] == '新闻潮讯':
             end = 8
         for i in range(1, end):
             url = u[0] % str(i)
             log.info('入队列 jz_qymh_pagesource')
             queue_job('main_lingkeqiche.Crawler1', {
                 'url': url,
                 'flag': u[-1]
             },
                       queue='jz_qymh_pagesource')
Example #13
    def run(self, params):

        key_name = params["key_name"]
        _, extension = os.path.splitext(key_name)
        if (extension == ".jpg"):
            conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
            bucket = conn.get_bucket(params["bucket_name"])
            key = bucket.get_key(key_name)
            key.get_contents_to_filename(key_name)
            log.info("Succesfully downloaded file from s3 bucket %s", key_name)
            queue_job("tasks.Write", {"key_name": key_name}, queue=write_queue)
        else:
            #TODO handle compressed and other file types
            log.warn(
                "Currently unable to handle file extension type for file %s",
                key_name)
            os.remove(key_name)
Example #14
def api_queue_job(task):  # the parameters are interdependent, so ApiSchemaWrapper cannot be used
    queue = request.args.get('queue', '').strip()
    args = request.args.to_dict()
    params, err = fixTaskParams(MRQ_TASK_DICT, task, args)
    rst = ApiErrorBuild()
    rst = {"job_id": queue_job(task, params, queue=queue if queue else None)} if err is None else err

    return jsonify(rst)
Example #15
 def exec_push_work(self, url):
     # import subprocess
     # modify arguments
     # These are passed down to the spider; besides what the spider itself needs,
     # they include all queue names and the paths of their tasks
     # If any queue name changes, update it here
     args = {
         'url': url,
         'spiderTask': 'spider.spider.__Spider',
         'spiderqueue': 'crawl_posts',
         'parseTask': 'parser.parse_posts.__Parser',
         'parsequeue': 'parse_posts'
     }
     # task = ['spider.spider_crawl.LcSpider']
     # command = ['mrq-run'] + task + args
     # '--queue', 'crawl_posts'
     # subprocess.Popen(command)
     queue_job(args['spiderTask'], args, queue=args['spiderqueue'])
Example #16
    def run(self, params):
        params1 = (
            'https://www.changan.com.cn/news-changan?page=%s&year=%s&keyword=&type=0&ajax_req=1&t=1584689024944',
            '长安动态')
        params2 = ('http://www.changan.com.cn/company.shtml', '合资合作')

        # On initialization, use a page length of 30
        for u in [params1]:
            if u[-1] == '长安动态':
                year = datetime.datetime.now().strftime('%Y')
                # Once a week, one page per run (8 articles)
                for page in range(1, 2):
                    url = u[0] % (str(page), str(year))
                    log.info('入队列 jz_qymh_pagesource')
                    queue_job('main_changanqiche.Crawler1', {
                        'url': url,
                        'flag': u[-1]
                    },
                              queue='jz_qymh_pagesource')
Example #17
 def _crawl(self, batch):
     responses = self._downloader.download(batch)
     if self._time_sleep:
         time.sleep(self._time_sleep)
     for response in responses:
         self._processor.logger.info(response)
         callback = response.request.callback(response)
         if isinstance(callback, types.GeneratorType):
             pipe = self._queue.get_pipe()
             for item in callback:
                 if isinstance(item, Request):
                     # logger.info("push request to queue..." + str(item))
                     if self._should_follow(item):
                         self._queue.push_pipe(item, pipe)
                 elif isinstance(item, PipeItem):
                     # If the returned object is a PipeItem, process it with the corresponding pipeline
                     self._process_count += 1
                     for pipe_name in item.pipe_names:
                         queue_job(PIPELINE_TASK,
                                   PipelineArgs(pipe_name, item.result),
                                   queue=PIPELINE)
                     if self.test:
                         if self._process_count > 0:
                             return
                 elif isinstance(
                         item,
                         Violet):  # if a tuple (Violet) is returned, i.e. the detail page's processor and its request
                     queue_job(CRAWLER_TASK,
                               CrawlArgs(item.processor, item.request),
                               queue=CRAWLER)
                 else:
                     raise Exception('not return correct value!!!')
             pipe.execute()
         elif isinstance(callback, Request):
             # logger.info("push request to queue..." + str(back))
             if self._should_follow(callback):
                 self._queue.push(callback)
         elif isinstance(callback, PipeItem):
             # If the returned object is a PipeItem, process it with the corresponding pipeline
             self._process_count += 1
             for pipe_name in callback.pipe_names:
                 queue_job(PIPELINE_TASK,
                           PipelineArgs(pipe_name, callback.result),
                           queue=PIPELINE)
         elif isinstance(callback,
                         Violet):  # if a tuple (Violet) is returned, i.e. the detail page's processor and its request
             queue_job(CRAWLER_TASK,
                       CrawlArgs(callback.processor, callback.request),
                       queue=CRAWLER)
         else:
             # # If the returned object is not a PipeItem, by default every pipeline would process it
             raise Exception('not return correct value!!!')
Example #18
def wait_for_job(path, params, **kwargs):
    job_id = queue_job(path, params, **kwargs)

    while True:
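        # Poll the job result every 5 seconds until it succeeds or leaves an active status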
        time.sleep(5)
        res = get_job_result(job_id)
        if res["status"] == "success":
            return res.get("result")
        elif res["status"] not in ["queued", "started", "interrupt"]:
            raise Exception("Job %s was in status %s" %
                            (path, res.get("status")))
Example #19
def wait_for_job(path, params, **kwargs):
    job_id = queue_job(path, params, **kwargs)

    while True:
        time.sleep(5)
        res = get_job_result(job_id)
        if res["status"] == "success":
            return res.get("result")
        elif res["status"] not in ["queued", "started", "interrupt"]:
            raise Exception("Job %s was in status %s" % (
                path, res.get("status")
            ))
Example #20
 def run(self, params):
     # On initialization, use a page length of 30
     # For the daily incremental run, one page is enough
     url_s1 = ('http://finance.eastmoney.com/news/cjjsp_%s.html', '经济时评')
     url_s2 = ('http://finance.eastmoney.com/news/cgnjj_%s.html', '国内经济')
     url_s3 = ('http://finance.eastmoney.com/news/cgjjj_%s.html', '国际经济')
     #        for i in range(1,4):
     for i in range(1, 26):
         url1 = url_s1[0] % str(i)
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler1', {
             'url': url1,
             'flag': url_s1[-1]
         },
                   queue='jz_cj_pagesource')
         url2 = url_s2[0] % str(i)
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler1', {
             'url': url2,
             'flag': url_s2[-1]
         },
                   queue='jz_cj_pagesource')
         url3 = url_s3[0] % str(i)
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler1', {
             'url': url3,
             'flag': url_s3[-1]
         },
                   queue='jz_cj_pagesource')
Example #21
def create_job():
    taskpath = request.form['path']
    taskparams = json.loads(request.form['params'])

    if taskpath.startswith("admin"):
        return None

    if g.user.is_authenticated():
        taskparams["user"] = str(g.user.id)

    job_id = queue_job("tasks.%s" % taskpath, taskparams)

    return json.dumps({"job_id": str(job_id)})
Example #23
def send_crawl_task(sid, tel, flow_type, province, city, timeout=5, sleep_time=1):
    alive = check_crawler_alive(sid)
    if alive:
        return False
    else:
        # send job queue to start crawler
        params = {
            'sid': sid,
            'tel': tel,
            'flow_type': flow_type,
            'province': province,
            'city': city
        }
        job_id = queue_job(config.TASK_PATH, params, queue=config.QUEUE_NAME)
        
        if len(str(job_id)) > 11:
            db['sid_info'].update_one(
                    {'sid': sid}, {'$set': {'job_id':job_id}}, upsert=True)
            return True
        else:
            return False
Example #24
 def run(self, params):
     url = params['url']
     flag = params['flag']
     try:
         info = sess.query(Jz_dongfangcaifu_PageSource).filter_by(
             url=url).first()
         sess.rollback()
         if not info:
             ps = crawl(url)
             if len(str(ps)) > 500 and '返回' not in str(ps):
                 uid = store2pg(ps=ps, url=url, flag=flag)
             else:
                 uid = None
             if uid:
                 log.info('入队列 jz_cj_parse')
                 queue_job('main_dongfangcaifu.Parse', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_cj_parse')
         else:
             print('新闻已存在')
             log.info('入队列 jz_cj_parse')
             queue_job('main_dongfangcaifu.Parse', {
                 'url': url,
                 'flag': flag
             },
                       queue='jz_cj_parse')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_cj_pagesource')
         queue_job('main_dongfangcaifu.Crawler2', {
             'url': url,
             'flag': flag
         },
                   queue='jz_cj_pagesource')
Example #25
 def run(self, params):
     url = params['url']
     flag = params['flag']
     try:
         info = sess.query(Jz_lingkeqiche_PageSource).filter_by(
             url=url).first()
         if not info:
             ps = crawl(url)
             if len(ps) > 100:
                 uid = store2pg(ps=ps, url=url, flag=flag)
             else:
                 uid = None
             if uid:
                 log.info('入队列 jz_qymh_parse')
                 queue_job('main_lingkeqiche.Parse', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qymh_parse')
         else:
             print('新闻已存在')
             log.info('入队列 jz_qymh_parse')
             queue_job('main_lingkeqiche.Parse', {
                 'url': url,
                 'flag': flag
             },
                       queue='jz_qymh_parse')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qymh_pagesource')
         queue_job('main_lingkeqiche.Crawler2', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qymh_pagesource')
Example #26
 def run(self, params):
     url = params['url']
     flag = params['flag']
     try:
         info = sess.query(Jz_qichetansuowang_PageSource).filter_by(
             url=url).first()
         sess.rollback()
         if not info:
             ps = crawl(url)
             if len(str(ps)) > 2000:
                 uid = store2pg(ps=ps, url=url, flag=flag)
             else:
                 uid = None
             if uid:
                 log.info('入队列 jz_qckj_parse')
                 queue_job('main_qichetansuowang.Parse', {
                     'url': url,
                     'flag': flag
                 },
                           queue='jz_qckj_parse')
         else:
             print('新闻已存在,并入解析')
             queue_job('main_qichetansuowang.Parse', {
                 'url': url,
                 'flag': flag
             },
                       queue='jz_qckj_parse')
     except Exception as e:
         print(e)
         print('重新入队')
         log.info('入队列 jz_qckj_pagesource')
         queue_job('main_qichetansuowang.Crawler2', {
             'url': url,
             'flag': flag
         },
                   queue='jz_qckj_pagesource')
Example #27
 def run(self, params):
     return queue_job(params["path"], params["params"])
Example #28
from mrq import context
from mrq.job import queue_job

context.setup_context()
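# Queue a single crawler.Fetch job on the "crawl" queue; queue_job returns the new job's id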

result = queue_job("crawler.Fetch", {
    "url": "http://docs.python-requests.org",
    "from": "whatever.com"
},
                   queue="crawl")

print(result)
Example #29
 def run(self, params):
     return queue_job(params["path"], params["params"], queue=params.get("queue"))
Example #30
    def run(self, params):
        url = params['url']
        flag = params['flag']
        try:
            info = sess1.query(Jz_changanqiche_content).filter_by(
                url=url).first()
            sess.rollback()
            if not info:
                info_2 = sess.query(Jz_changanqiche_PageSource).filter_by(
                    url=url).first()
                sess.rollback()
                ps = info_2.pagesource
                ps_uid = info_2.uid
                #                author = re.findall('',ps)
                #                author = author[0] if author else None
                public_time = re.findall('><span id="love_number">(.*?)</span',
                                         ps)
                public_time = public_time[0].strip() if public_time else None
                content1 = re.compile(
                    '<div class="news-details-main">(.*?)<div class="details-main-btn"',
                    re.S).findall(ps)
                content2 = content1[0] if content1 else None
                pic = re.findall('src="(.*?)" ', content2) if content2 else []
                for i in range(len(pic)):
                    pic[i] = 'https:' + pic[i]
                pic = ';'.join(set(pic))
                content2 = content2.replace('<br/>', '\n').replace(
                    '<img src', '[img src').replace('jpg"/>', 'jpg"/]')
                content = re.sub('<.*?>', '',
                                 content2.replace('&nbsp;', '')).replace(
                                     '$(".content img").wrap("");',
                                     '').strip()
                title = re.findall('<h2>(.*?)</h2>', ps)[0]

                meta_keywords = re.compile(
                    '<meta name="keywords" content="(.*?)">',
                    re.S).findall(ps)[0].strip()
                hid = store2pg_parse(url=url,
                                     author=None,
                                     public_time=public_time,
                                     page_source=ps_uid,
                                     content=content,
                                     website_name='长安汽车',
                                     channel_name=flag,
                                     title=title,
                                     topic=None,
                                     tag=meta_keywords,
                                     meta_keywords=None,
                                     pic=pic,
                                     flag=None)
                if hid:
                    print('完成')
            else:
                print('新闻解析已存在')
        except Exception as e:
            print(e)
            if str(e) != "'NoneType' object has no attribute 'replace'":
                print('重新入队')
                log.info('入队列 jz_qymh_parse')
                queue_job('main_changanqiche.Parse', {
                    'url': url,
                    'flag': flag
                },
                          queue='jz_qymh_parse')
Example #31
 def requeue_job(self, params, fpath=None, nqueue=None):
     '''
     Requeue a failed job
     '''
     log.warning('Job Failed, re-queue...%s' % params['url'])
     queue_job(fpath, params, queue=nqueue)
Example #32
 def parser_job(self, params, fpath=None, nqueue=None):
     '''
     Add to the queue of the Parser task
     '''
     queue_job(fpath, params, queue=nqueue)
Example #33
# -*- coding: utf-8 -*-
"""
 Created with IntelliJ IDEA.
 Description:
 User: jinhuichen
 Date: 3/19/2018 11:35 AM 
 Description: 
"""
from mrq.job import queue_job
from constants.task_name import CRAWLER_TASK
from constants.queue_name import CRAWLER
from fetchman.pipeline.pipe_item import CrawlArgs
from processors.tuliu_processor import Tuliu_Processor
# from tasks.spider_task import no_queue_task

if __name__ == '__main__':
    # res = no_queue_task({"processor": Tuliu_Processor.__name__})
    # Start the initialization task
    res = queue_job(CRAWLER_TASK, CrawlArgs(Tuliu_Processor), queue=CRAWLER)
    print(res)
Example #34
File: crawler.py Project: AshBT/mrq
    def run(self, params):

        collection = connections.mongodb_jobs.simple_crawler_urls

        response = requests.get(params["url"])

        if response.status_code != 200:
            log.warning("Got status %s on page %s (Queued from %s)" % (
                response.status_code, response.url, params.get("from")
            ))
            return False

        # Store redirects
        if response.url != params["url"]:
            collection.update({"_id": params["url"]}, {"$set": {
                "redirected_to": response.url,
                "fetched_date": datetime.datetime.now()
            }})

        document = lxml.html.fromstring(response.content)

        document.make_links_absolute(response.url)

        queued_count = 0

        document_domain = urlparse.urlparse(response.url).netloc

        for (element, attribute, link, pos) in document.iterlinks():

            link = re.sub("#.*", "", link or "")

            if not link:
                continue

            domain = urlparse.urlparse(link).netloc

            # Don't follow external links for this example
            if domain != document_domain:
                continue

            # We don't want to re-queue URLs twice. If we try to insert a duplicate,
            # pymongo will throw an error
            try:
                collection.insert({"_id": link})
            except:
                continue

            queue_job("crawler.Fetch", {
                "url": link,
                "from": params["url"]
            }, queue="crawl")
            queued_count += 1

        stored_data = {
            "_id": response.url,
            "queued_urls": queued_count,
            "html_length": len(response.content),
            "fetched_date": datetime.datetime.now()
        }

        collection.update(
            {"_id": response.url},
            stored_data,
            upsert=True
        )

        return True
Example #36
    def run(self, params):

        collection = connections.mongodb_jobs.simple_crawler_urls

        response = requests.get(params["url"])

        if response.status_code != 200:
            log.warning(
                "Got status %s on page %s (Queued from %s)" %
                (response.status_code, response.url, params.get("from")))
            return False

        # Store redirects
        if response.url != params["url"]:
            collection.update({"_id": params["url"]}, {
                "$set": {
                    "redirected_to": response.url,
                    "fetched_date": datetime.datetime.now()
                }
            })

        document = lxml.html.fromstring(response.content)

        document.make_links_absolute(response.url)

        queued_count = 0

        document_domain = urlparse.urlparse(response.url).netloc

        for (element, attribute, link, pos) in document.iterlinks():

            link = re.sub("#.*", "", link or "")

            if not link:
                continue

            domain = urlparse.urlparse(link).netloc

            # Don't follow external links for this example
            if domain != document_domain:
                continue

            # We don't want to re-queue URLs twice. If we try to insert a duplicate,
            # pymongo will throw an error
            try:
                collection.insert({"_id": link})
            except:
                continue

            queue_job("crawler.Fetch", {
                "url": link,
                "from": params["url"]
            },
                      queue="crawl")
            queued_count += 1

        stored_data = {
            "_id": response.url,
            "queued_urls": queued_count,
            "html_length": len(response.content),
            "fetched_date": datetime.datetime.now()
        }

        collection.update({"_id": response.url}, stored_data, upsert=True)

        return True