def run(self, params):
    url = params['url']
    flag = params['flag']
    print(url)
    try:
        ps = crawl(url)
        print(len(ps))
        if len(ps) > 10:
            uid = store2pg(ps=ps, url=url, flag=flag)
        else:
            uid = None
        if uid:
            urls = re.findall('"supdata_whereid":"(.*?)"', ps)
            for u in urls:
                url = 'http://www.changan.com.cn/news-details.shtml?whereid=%s&column_id=98' % u
                log.info('Enqueue to jz_qymh_pagesource')
                queue_job('main_changanqiche.Crawler2', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qymh_pagesource')
    except Exception as e:
        print(e)
        print('Re-queueing')
        log.info('Enqueue to jz_qymh_pagesource')
        queue_job('main_changanqiche.Crawler1', {
            'url': url,
            'flag': flag
        }, queue='jz_qymh_pagesource')

def run(self, params):
    url = params['url']
    flag = params['flag']
    print(url)
    try:
        ps = crawl(url)
        if len(ps) > 2000:
            uid = store2pg(ps=ps, url=url, flag=flag)
        else:
            uid = None
        if uid:
            urls = re.compile(
                '<a target="_blank" href="(http://www.feelcars.com/.*?html)">',
                re.S).findall(ps)
            for u in urls:
                url = u
                log.info('Enqueue to jz_qckj_pagesource')
                queue_job('main_qichetansuowang.Crawler2', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qckj_pagesource')
    except Exception as e:
        print(e)
        print('Re-queueing')
        log.info('Enqueue to jz_qckj_pagesource')
        queue_job('main_qichetansuowang.Crawler1', {
            'url': url,
            'flag': flag
        }, queue='jz_qckj_pagesource')

def run(self, params):
    url = params['url']
    flag = params['flag']
    print(url)
    try:
        ps = crawl(url)
        if len(str(ps)) > 500:
            uid = store2pg(ps=ps, url=url, flag=flag)
        else:
            uid = None
        if uid:
            urls = list(
                re.compile(
                    '<p class="title">.*?<a href="(http:.*?)" target="_blank">',
                    re.S).findall(ps))
            for u in urls:
                url1 = u
                log.info('Enqueue to jz_cj_pagesource')
                queue_job('main_dongfangcaifu.Crawler2', {
                    'url': url1,
                    'flag': flag
                }, queue='jz_cj_pagesource')
    except Exception as e:
        print(e)
        print('Re-queueing')
        log.info('Enqueue to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler1', {
            'url': url,
            'flag': flag
        }, queue='jz_cj_pagesource')

def run(self, params):
    url = params['url']
    flag = params['flag']
    print(url)
    try:
        ps = crawl(url)
        print(len(ps))
        if len(ps) > 100:
            uid = store2pg(ps=ps, url=url, flag=flag)
        else:
            uid = None
        if uid:
            urls = re.findall('<a class="newsLink" href="(.*?)">', ps)
            for u in urls:
                url = 'https://www.lynkco.com.cn' + u
                log.info('Enqueue to jz_qymh_pagesource')
                queue_job('main_lingkeqiche.Crawler2', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qymh_pagesource')
    except Exception as e:
        print(e)
        print('Re-queueing')
        log.info('Enqueue to jz_qymh_pagesource')
        queue_job('main_lingkeqiche.Crawler1', {
            'url': url,
            'flag': flag
        }, queue='jz_qymh_pagesource')

def requeue_job(self, params, fpath=None, nqueue=None):
    '''
    Requeue an unfinished job; helper function for run_wrapped.
    '''
    log.warning('Job Failed, re-queue...%s' % params['url'])
    queue_job(fpath, params, queue=nqueue)

def run(self, params): users = [{"user": x} for x in get_distinct_users()] print "Queue users" queue_jobs("bluebirdlib.tasks.GetUserTweets", users, queue="tweets") hts = [{"search": x} for x in HASHTAGS] print "Queue hashtags" queue_jobs("bluebirdlib.tasks.getHashTagTweets", hts, queue="tweets") print "Queue scheduler" queue_job("bluebirdlib.tasks.Scheduler", {}, queue="tweets") return 0
def initialize_jobs(bucket_name):
    setup_context()
    jobs_count = 0
    conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    bucket = Bucket(connection=conn, name=bucket_name)
    for key in bucket.list():
        queue_job("tasks.Download", {
            "bucket_name": bucket_name,
            "key_name": key.key
        }, queue=download_queue)
        jobs_count += 1
    return jobs_count

def run(self, params):
    # On initialization, use a page length of 30.
    # For the daily incremental crawl, one page is enough.
    url_s1 = ('http://www.feelcars.com/category/xinnengyuan/page/%s', '新能源')
    end = 4
    for i in range(1, end):
        url = url_s1[0] % str(i)
        log.info('Enqueue to jz_qckj_pagesource')
        queue_job('main_qichetansuowang.Crawler1', {
            'url': url,
            'flag': url_s1[-1]
        }, queue='jz_qckj_pagesource')

def main():
    parser = argparse.ArgumentParser(description='Runs a task')

    cfg = config.get_config(parser=parser, config_type="run",
                            sources=("file", "env", "args"))
    cfg["is_cli"] = True
    set_current_config(cfg)

    if len(cfg["taskargs"]) == 1:
        params = json.loads(cfg["taskargs"][0])  # pylint: disable=no-member
    else:
        params = {}

        # mrq-run taskpath a 1 b 2 => {"a": "1", "b": "2"}
        for group in utils.group_iter(cfg["taskargs"], n=2):
            if len(group) != 2:
                print("Number of arguments wasn't even")
                sys.exit(1)
            params[group[0]] = group[1]

    if cfg["queue"]:
        ret = queue_job(cfg["taskpath"], params, queue=cfg["queue"])
        print(ret)
    else:
        worker_class = load_class_by_path(cfg["worker_class"])
        job = worker_class.job_class(None)
        job.set_data({
            "path": cfg["taskpath"],
            "params": params,
            "queue": cfg["queue"]
        })
        job.datestarted = datetime.datetime.utcnow()
        set_current_job(job)
        ret = job.perform()
        print(json_stdlib.dumps(ret, cls=MongoJSONEncoder))  # pylint: disable=no-member

def api_job_action():
    params = {k: v for k, v in iteritems(request.form)}
    if params.get("status") and "-" in params.get("status"):
        params["status"] = params.get("status").split("-")
    return jsonify({
        "job_id": queue_job("mrq.basetasks.utils.JobAction",
                            params,
                            queue=get_current_config()["dashboard_queue"])
    })

def api_job_action():
    params = {k: v for k, v in request.form.iteritems()}
    if params.get("status") and "-" in params.get("status"):
        params["status"] = params.get("status").split("-")
    return jsonify({
        "job_id": queue_job("mrq.basetasks.utils.JobAction",
                            params,
                            queue=get_current_config()["dashboard_queue"])
    })

def run(self, params):
    params1 = (
        'https://www.lynkco.com.cn/Brand/News/NewsMore?pageIndex=%s',
        '新闻潮讯')
    # On initialization, use a page length of 30.
    for u in [params1]:
        if u[-1] == '新闻潮讯':
            end = 8
            for i in range(1, end):
                url = u[0] % str(i)
                log.info('Enqueue to jz_qymh_pagesource')
                queue_job('main_lingkeqiche.Crawler1', {
                    'url': url,
                    'flag': u[-1]
                }, queue='jz_qymh_pagesource')

def run(self, params): key_name = params["key_name"] _, extension = os.path.splitext(key_name) if (extension == ".jpg"): conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY) bucket = conn.get_bucket(params["bucket_name"]) key = bucket.get_key(key_name) key.get_contents_to_filename(key_name) log.info("Succesfully downloaded file from s3 bucket %s", key_name) queue_job("tasks.Write", {"key_name": key_name}, queue=write_queue) else: #TODO handle compressed and other file types log.warn( "Currently unable to handle file extension type for file %s", key_name) os.remove(key_name)
def api_queue_job(task):
    # The parameters depend on each other, so ApiSchemaWrapper cannot be used here.
    queue = request.args.get('queue', '').strip()
    args = request.args.to_dict()
    params, err = fixTaskParams(MRQ_TASK_DICT, task, args)
    rst = ApiErrorBuild()
    rst = {
        "job_id": queue_job(task, params, queue=queue if queue else None)
    } if err is None else err
    return jsonify(rst)

def exec_push_work(self, url):
    # import subprocess
    # modify arguments
    # These are the arguments passed down to the spider. Besides what the
    # spider itself needs, they include all queue names and the paths of
    # their tasks. If a queue name changes, change it here.
    args = {
        'url': url,
        'spiderTask': 'spider.spider.__Spider',
        'spiderqueue': 'crawl_posts',
        'parseTask': 'parser.parse_posts.__Parser',
        'parsequeue': 'parse_posts'
    }
    # task = ['spider.spider_crawl.LcSpider']
    # command = ['mrq-run'] + task + args  # '--queue', 'crawl_posts'
    # subprocess.Popen(command)
    queue_job(args['spiderTask'], args, queue=args['spiderqueue'])

def run(self, params):
    params1 = (
        'https://www.changan.com.cn/news-changan?page=%s&year=%s&keyword=&type=0&ajax_req=1&t=1584689024944',
        '长安动态')
    params2 = ('http://www.changan.com.cn/company.shtml', '合资合作')
    # On initialization, use a page length of 30.
    for u in [params1]:
        if u[-1] == '长安动态':
            year = datetime.datetime.now().strftime('%Y')
            # Once a week, one page at a time (8 articles).
            for page in range(1, 2):
                url = u[0] % (str(page), str(year))
                log.info('Enqueue to jz_qymh_pagesource')
                queue_job('main_changanqiche.Crawler1', {
                    'url': url,
                    'flag': u[-1]
                }, queue='jz_qymh_pagesource')

def _crawl(self, batch):
    responses = self._downloader.download(batch)
    if self._time_sleep:
        time.sleep(self._time_sleep)
    for response in responses:
        self._processor.logger.info(response)
        callback = response.request.callback(response)
        if isinstance(callback, types.GeneratorType):
            pipe = self._queue.get_pipe()
            for item in callback:
                if isinstance(item, Request):
                    # logger.info("push request to queue..." + str(item))
                    if self._should_follow(item):
                        self._queue.push_pipe(item, pipe)
                elif isinstance(item, PipeItem):
                    # If the yielded object is a PipeItem, handle it with the
                    # corresponding pipelines.
                    self._process_count += 1
                    for pipe_name in item.pipe_names:
                        queue_job(PIPELINE_TASK,
                                  PipelineArgs(pipe_name, item.result),
                                  queue=PIPELINE)
                    if self.test:
                        if self._process_count > 0:
                            return
                elif isinstance(item, Violet):
                    # If a Violet is yielded, it carries the detail-page
                    # processor and the detail-page request info.
                    queue_job(CRAWLER_TASK,
                              CrawlArgs(item.processor, item.request),
                              queue=CRAWLER)
                else:
                    raise Exception('not return correct value!!!')
            pipe.execute()
        elif isinstance(callback, Request):
            # logger.info("push request to queue..." + str(back))
            if self._should_follow(callback):
                self._queue.push(callback)
        elif isinstance(callback, PipeItem):
            # If the returned object is a PipeItem, handle it with the
            # corresponding pipelines.
            self._process_count += 1
            for pipe_name in callback.pipe_names:
                queue_job(PIPELINE_TASK,
                          PipelineArgs(pipe_name, callback.result),
                          queue=PIPELINE)
        elif isinstance(callback, Violet):
            # If a Violet is returned, it carries the detail-page processor
            # and the detail-page request info.
            queue_job(CRAWLER_TASK,
                      CrawlArgs(callback.processor, callback.request),
                      queue=CRAWLER)
        else:
            # If the returned object is none of the expected types, fail loudly.
            raise Exception('not return correct value!!!')

def wait_for_job(path, params, **kwargs): job_id = queue_job(path, params, **kwargs) while True: time.sleep(5) res = get_job_result(job_id) if res["status"] == "success": return res.get("result") elif res["status"] not in ["queued", "started", "interrupt"]: raise Exception("Job %s was in status %s" % (path, res.get("status")))
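# A minimal usage sketch for the wait_for_job helper above, assuming an mrq
# context is already set up. The task path and queue name ("crawler.Fetch" on
# the "crawl" queue) are borrowed from the crawler example elsewhere in this
# file; substitute your own task and queue.
if __name__ == '__main__':
    result = wait_for_job('crawler.Fetch',
                          {'url': 'http://docs.python-requests.org',
                           'from': 'whatever.com'},
                          queue='crawl')
    print(result)
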
def wait_for_job(path, params, **kwargs): job_id = queue_job(path, params, **kwargs) while True: time.sleep(5) res = get_job_result(job_id) if res["status"] == "success": return res.get("result") elif res["status"] not in ["queued", "started", "interrupt"]: raise Exception("Job %s was in status %s" % ( path, res.get("status") ))
def run(self, params):
    # On initialization, use a page length of 30.
    # For the daily incremental crawl, one page is enough.
    url_s1 = ('http://finance.eastmoney.com/news/cjjsp_%s.html', '经济时评')
    url_s2 = ('http://finance.eastmoney.com/news/cgnjj_%s.html', '国内经济')
    url_s3 = ('http://finance.eastmoney.com/news/cgjjj_%s.html', '国际经济')
    # for i in range(1,4):
    for i in range(1, 26):
        url1 = url_s1[0] % str(i)
        log.info('Enqueue to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler1', {
            'url': url1,
            'flag': url_s1[-1]
        }, queue='jz_cj_pagesource')

        url2 = url_s2[0] % str(i)
        log.info('Enqueue to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler1', {
            'url': url2,
            'flag': url_s2[-1]
        }, queue='jz_cj_pagesource')

        url3 = url_s3[0] % str(i)
        log.info('Enqueue to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler1', {
            'url': url3,
            'flag': url_s3[-1]
        }, queue='jz_cj_pagesource')

def create_job():
    taskpath = request.form['path']
    taskparams = json.loads(request.form['params'])
    if taskpath.startswith("admin"):
        return None
    if g.user.is_authenticated():
        taskparams["user"] = str(g.user.id)
    job_id = queue_job("tasks.%s" % taskpath, taskparams)
    return json.dumps({"job_id": str(job_id)})

def send_crawl_task(sid, tel, flow_type, province, city, timeout=5, sleep_time=1):
    alive = check_crawler_alive(sid)
    if alive:
        return False
    else:
        # send job queue to start crawler
        params = {
            'sid': sid,
            'tel': tel,
            'flow_type': flow_type,
            'province': province,
            'city': city
        }
        job_id = queue_job(config.TASK_PATH, params, queue=config.QUEUE_NAME)
        if len(str(job_id)) > 11:
            db['sid_info'].update_one(
                {'sid': sid},
                {'$set': {'job_id': job_id}},
                upsert=True)
            return True
        else:
            return False

def run(self, params):
    url = params['url']
    flag = params['flag']
    try:
        info = sess.query(Jz_dongfangcaifu_PageSource).filter_by(
            url=url).first()
        sess.rollback()
        if not info:
            ps = crawl(url)
            if len(str(ps)) > 500 and '返回' not in str(ps):
                uid = store2pg(ps=ps, url=url, flag=flag)
            else:
                uid = None
            if uid:
                log.info('Enqueue to jz_cj_parse')
                queue_job('main_dongfangcaifu.Parse', {
                    'url': url,
                    'flag': flag
                }, queue='jz_cj_parse')
        else:
            print('News already exists')
            log.info('Enqueue to jz_cj_parse')
            queue_job('main_dongfangcaifu.Parse', {
                'url': url,
                'flag': flag
            }, queue='jz_cj_parse')
    except Exception as e:
        print(e)
        print('Re-queueing')
        log.info('Enqueue to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler2', {
            'url': url,
            'flag': flag
        }, queue='jz_cj_pagesource')

def run(self, params):
    url = params['url']
    flag = params['flag']
    try:
        info = sess.query(Jz_lingkeqiche_PageSource).filter_by(
            url=url).first()
        if not info:
            ps = crawl(url)
            if len(ps) > 100:
                uid = store2pg(ps=ps, url=url, flag=flag)
            else:
                uid = None
            if uid:
                log.info('Enqueue to jz_qymh_parse')
                queue_job('main_lingkeqiche.Parse', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qymh_parse')
        else:
            print('News already exists')
            log.info('Enqueue to jz_qymh_parse')
            queue_job('main_lingkeqiche.Parse', {
                'url': url,
                'flag': flag
            }, queue='jz_qymh_parse')
    except Exception as e:
        print(e)
        print('Re-queueing')
        log.info('Enqueue to jz_qymh_pagesource')
        queue_job('main_lingkeqiche.Crawler2', {
            'url': url,
            'flag': flag
        }, queue='jz_qymh_pagesource')

def run(self, params):
    url = params['url']
    flag = params['flag']
    try:
        info = sess.query(Jz_qichetansuowang_PageSource).filter_by(
            url=url).first()
        sess.rollback()
        if not info:
            ps = crawl(url)
            if len(str(ps)) > 2000:
                uid = store2pg(ps=ps, url=url, flag=flag)
            else:
                uid = None
            if uid:
                log.info('Enqueue to jz_qckj_parse')
                queue_job('main_qichetansuowang.Parse', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qckj_parse')
        else:
            print('News already exists, sending it to the parse queue')
            queue_job('main_qichetansuowang.Parse', {
                'url': url,
                'flag': flag
            }, queue='jz_qckj_parse')
    except Exception as e:
        print(e)
        print('Re-queueing')
        log.info('Enqueue to jz_qckj_pagesource')
        queue_job('main_qichetansuowang.Crawler2', {
            'url': url,
            'flag': flag
        }, queue='jz_qckj_pagesource')

def run(self, params): return queue_job(params["path"], params["params"])
from mrq import context
from mrq.job import queue_job

context.setup_context()

result = queue_job("crawler.Fetch", {
    "url": "http://docs.python-requests.org",
    "from": "whatever.com"
}, queue="crawl")

print(result)

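# A hedged batch variant of the example above: mrq's queue_jobs helper (used in
# the bluebirdlib snippet earlier; imported here from mrq.job, mirroring the
# queue_job import) enqueues one job per params dict against the same task path.
# The seed URLs are illustrative only.
from mrq import context
from mrq.job import queue_jobs

context.setup_context()

seed_params = [
    {"url": "http://docs.python-requests.org", "from": "whatever.com"},
    {"url": "http://docs.python.org", "from": "whatever.com"},
]

job_ids = queue_jobs("crawler.Fetch", seed_params, queue="crawl")
print(job_ids)
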
def run(self, params): return queue_job(params["path"], params["params"], queue=params.get("queue"))
def run(self, params):
    url = params['url']
    flag = params['flag']
    try:
        info = sess1.query(Jz_changanqiche_content).filter_by(
            url=url).first()
        sess.rollback()
        if not info:
            info_2 = sess.query(Jz_changanqiche_PageSource).filter_by(
                url=url).first()
            sess.rollback()
            ps = info_2.pagesource
            ps_uid = info_2.uid
            # author = re.findall('',ps)
            # author = author[0] if author else None
            public_time = re.findall('><span id="love_number">(.*?)</span', ps)
            public_time = public_time[0].strip() if public_time else None
            content1 = re.compile(
                '<div class="news-details-main">(.*?)<div class="details-main-btn"',
                re.S).findall(ps)
            content2 = content1[0] if content1 else None
            pic = re.findall('src="(.*?)" ', content2) if content2 else []
            for i in range(len(pic)):
                pic[i] = 'https:' + pic[i]
            pic = ';'.join(set(pic))
            content2 = content2.replace('<br/>', '\n').replace(
                '<img src', '[img src').replace('jpg"/>', 'jpg"/]')
            content = re.sub('<.*?>', '', content2.replace(' ', '')).replace(
                '$(".content img").wrap("");', '').strip()
            title = re.findall('<h2>(.*?)</h2>', ps)[0]
            meta_keywords = re.compile(
                '<meta name="keywords" content="(.*?)">',
                re.S).findall(ps)[0].strip()
            hid = store2pg_parse(url=url,
                                 author=None,
                                 public_time=public_time,
                                 page_source=ps_uid,
                                 content=content,
                                 website_name='长安汽车',
                                 channel_name=flag,
                                 title=title,
                                 topic=None,
                                 tag=meta_keywords,
                                 meta_keywords=None,
                                 pic=pic,
                                 flag=None)
            if hid:
                print('Done')
            else:
                print('Parsed news already exists')
    except Exception as e:
        print(e)
        if str(e) != "'NoneType' object has no attribute 'replace'":
            print('Re-queueing')
            log.info('Enqueue to jz_qymh_parse')
            queue_job('main_changanqiche.Parse', {
                'url': url,
                'flag': flag
            }, queue='jz_qymh_parse')

def requeue_job(self, params, fpath=None, nqueue=None):
    '''
    Requeue a failed job
    '''
    log.warning('Job Failed, re-queue...%s' % params['url'])
    queue_job(fpath, params, queue=nqueue)

def parser_job(self, params, fpath=None, nqueue=None):
    '''
    Add to the queue of the Parser task
    '''
    queue_job(fpath, params, queue=nqueue)

# -*- coding: utf-8 -*-
"""
Created with IntelliJ IDEA.
User: jinhuichen
Date: 3/19/2018 11:35 AM
Description:
"""
from mrq.job import queue_job

from constants.task_name import CRAWLER_TASK
from constants.queue_name import CRAWLER
from fetchman.pipeline.pipe_item import CrawlArgs
from processors.tuliu_processor import Tuliu_Processor

# from tasks.spider_task import no_queue_task

if __name__ == '__main__':
    # res = no_queue_task({"processor": Tuliu_Processor.__name__})
    # Kick off the initial crawl task.
    res = queue_job(CRAWLER_TASK, CrawlArgs(Tuliu_Processor), queue=CRAWLER)
    print(res)

def run(self, params): collection = connections.mongodb_jobs.simple_crawler_urls response = requests.get(params["url"]) if response.status_code != 200: log.warning("Got status %s on page %s (Queued from %s)" % ( response.status_code, response.url, params.get("from") )) return False # Store redirects if response.url != params["url"]: collection.update({"_id": params["url"]}, {"$set": { "redirected_to": response.url, "fetched_date": datetime.datetime.now() }}) document = lxml.html.fromstring(response.content) document.make_links_absolute(response.url) queued_count = 0 document_domain = urlparse.urlparse(response.url).netloc for (element, attribute, link, pos) in document.iterlinks(): link = re.sub("#.*", "", link or "") if not link: continue domain = urlparse.urlparse(link).netloc # Don't follow external links for this example if domain != document_domain: continue # We don't want to re-queue URLs twice. If we try to insert a duplicate, # pymongo will throw an error try: collection.insert({"_id": link}) except: continue queue_job("crawler.Fetch", { "url": link, "from": params["url"] }, queue="crawl") queued_count += 1 stored_data = { "_id": response.url, "queued_urls": queued_count, "html_length": len(response.content), "fetched_date": datetime.datetime.now() } collection.update( {"_id": response.url}, stored_data, upsert=True ) return True
def run(self, params): collection = connections.mongodb_jobs.simple_crawler_urls response = requests.get(params["url"]) if response.status_code != 200: log.warning( "Got status %s on page %s (Queued from %s)" % (response.status_code, response.url, params.get("from"))) return False # Store redirects if response.url != params["url"]: collection.update({"_id": params["url"]}, { "$set": { "redirected_to": response.url, "fetched_date": datetime.datetime.now() } }) document = lxml.html.fromstring(response.content) document.make_links_absolute(response.url) queued_count = 0 document_domain = urlparse.urlparse(response.url).netloc for (element, attribute, link, pos) in document.iterlinks(): link = re.sub("#.*", "", link or "") if not link: continue domain = urlparse.urlparse(link).netloc # Don't follow external links for this example if domain != document_domain: continue # We don't want to re-queue URLs twice. If we try to insert a duplicate, # pymongo will throw an error try: collection.insert({"_id": link}) except: continue queue_job("crawler.Fetch", { "url": link, "from": params["url"] }, queue="crawl") queued_count += 1 stored_data = { "_id": response.url, "queued_urls": queued_count, "html_length": len(response.content), "fetched_date": datetime.datetime.now() } collection.update({"_id": response.url}, stored_data, upsert=True) return True