def getToken_key(key):
    """Issue a fresh access token for the app matching *key*.

    Looks the app row up by its unique key, builds a token valid for two
    hours (md5 of the formatted expiry timestamp), persists both values on
    the app row, and returns them.

    Returns False when no app row matches *key*.
    """
    app = db.fetchone('select * from app where unique_key=:key', {'key': key})
    if not app:
        return False
    # Token lifetime: 7200 seconds (two hours) from now.
    expires_at = getTime("%Y-%m-%d %H:%M:%S", getTime() + 7200)
    new_token = md5(expires_at)
    db.updatebyid('app', {'token': new_token, 'token_expired': expires_at}, app['id'])
    return {'token': new_token, 'expired': expires_at}
def crawljs(taskInfo):
    """Crawl one JS task: fetch the page, extract URLs, persist the result.

    taskInfo: dict with at least 'id', 'url' and 'status'.

    Returns True on success (including the no-op case where the task was
    already crawled), False when any exception occurs.
    """
    try:
        # Already crawled: any status other than 0/1 means nothing to do.
        if taskInfo['status'] not in (0, 1):
            return True
        # Fetch the page source.
        requestInfo = spiderRequest(taskInfo['url'])
        # NOTE(review): _parseForJs receives the URL, not the fetched body
        # (requestInfo['body']) -- confirm this is intended.
        parseResults = []
        for record in _parseForJs(taskInfo['url']):
            urlRow = _parseForUrl(record)
            if urlRow:
                parseResults.append(urlRow)
        updateRow = {
            'id': taskInfo['id'],
            'http_code': requestInfo['http_code'],
            'response_headers': json.dumps(requestInfo['response_headers'],
                                           ensure_ascii=False),
            'body': requestInfo['body'],
            'md5_body': md5(requestInfo['body']),
            'parse_result': json.dumps(parseResults, ensure_ascii=False),
            'status': 2,  # 2 = crawled
            'end_at': getTime('%Y-%m-%d %H:%M:%S'),
        }
        # Persist the crawl result.
        mg_spiderjsurl_save(updateRow)
        # Fix: the success path previously fell through and returned None,
        # while the other paths return True/False explicitly.
        return True
    except Exception as e:
        logger.exception(e)
        return False
def execCasper(content=None):
    """Write *content* to a temp script file and execute it with casperjs.

    Returns the process's combined stdout/stderr as a decoded string, or
    False if anything fails.
    """
    try:
        filename = "%s/%s_%s" % (PATH_TMP_NODEJS, getTime('%Y%m%d'), md5(content))
        write(filename, content)
        cmd = 'casperjs ' + filename
        child = Popen(cmd, shell=True, close_fds=True, bufsize=-1,
                      stdout=PIPE, stderr=STDOUT)
        # Fix: communicate() reads all output AND waits for / reaps the
        # child; the previous bare stdout.read() left a zombie process.
        output, _ = child.communicate()
        #remove(filename)
        return output.decode()
    except Exception as e:
        logger.exception(e)
        return False
def generate(urlid):
    """Render/mirror the spider URL row *urlid* through phantomjs.

    Returns a status dict: status 0 when the row does not exist; status 1
    for both success and caught errors (distinguish via 'msg'). The row's
    DB status is set to 4 on success, 401 on a failed phantomjs exit, and
    402 on an exception.
    """
    row = mgdb.spiderurl_getbyid(urlid)
    if not row:
        return {
            'status': 0,
            'msg': 'spider_url[%s] is not exists' % urlid,
            'domirror_urlid': urlid
        }
    try:
        command = "phantomjs --ignore-ssl-errors=true --proxy=%s %s/mirror.js %s" % (
            MIRROR_PROXY, PATH_NODEJS, row['url'])
        child = subprocess.Popen(command, shell=True, close_fds=True, bufsize=-1,
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        processOutputs = [line.decode().strip() for line in child.stdout.readlines()]
        # Fix: wait() reaps the child and returns its real exit code;
        # poll() could still be None (falsy) here, mis-reporting a failed
        # run as success. (Also dropped an unused timestamped-command local.)
        if child.wait():
            mgdb.spiderurl_save({'status': 401}, row['id'])
            logger.debug("failed %s\n%s\n%s" % (row['url'], command,
                                                "\n".join(processOutputs)))
        else:
            mgdb.spiderurl_save({'status': 4}, row['id'])
            logger.debug("success %s\n%s\n%s" % (row['url'], command,
                                                 "\n".join(processOutputs)))
        return {'status': 1, 'msg': 'mirror ok', 'domirror_urlid': urlid}
    except Exception as e:
        mgdb.spiderurl_save({'status': 402, 'error': repr(e)}, row['id'])
        logger.error("doMirror::" + str(urlid) + "::" + repr(e))
        return {'status': 1, 'msg': repr(e), 'domirror_urlid': urlid}
def test4_time_int(self):
    """getTime(None, '20170101') should yield an int timestamp."""
    value = getTime(None, '20170101')
    #print(value)
    self.assertTrue(isinstance(value, int))
def test3_time_str(self):
    """getTime with a format and a date string should yield a str."""
    value = getTime('%Y-%m-%d', '20170101')
    #print(value)
    self.assertTrue(isinstance(value, str))
def test2_now_str(self):
    """getTime with only a format should yield a formatted str for now."""
    value = getTime('%Y%m%d')
    #print(value)
    self.assertTrue(isinstance(value, str))
def test1_now(self):
    """getTime() with no arguments should yield an int timestamp."""
    value = getTime()
    self.assertTrue(isinstance(value, int))
now = getTime() result = isinstance(now, (int)) self.assertTrue(result) def test2_now_str(self): now = getTime('%Y%m%d') #print(now) result = isinstance(now, (str)) self.assertTrue(result) def test3_time_str(self): now = getTime('%Y-%m-%d', '20170101') #print(now) result = isinstance(now, (str)) self.assertTrue(result) def test4_time_int(self): now = getTime(None, '20170101') #print(now) result = isinstance(now, (int)) self.assertTrue(result) params = {} params['execute_delay'] = 3600 print(getTime()) now = getTime() + params['execute_delay'] print(now, type(now) == int) dateStr = getTime('%Y%m%d %H%M%S', now) print(dateStr)
def task_save(params=None):
    """Create or update a crawl task and schedule its execution.

    params: dict of task columns; must carry 'id' (update) and/or
    'start_urls' (newline-separated list, required for creation). Optional
    'execute_at' / 'execute_delay' produce a one-shot scheduled run;
    'crontab' produces a recurring scheduler job; otherwise the task is
    started immediately.

    Returns the task id, or False when neither 'id' nor 'start_urls' is
    given.
    """
    if not params['id'] and not params['start_urls']:
        return False
    if params['start_urls']:
        startUrls = params['start_urls'].split("\n")
        params['start_urls'] = json.dumps(startUrls, ensure_ascii=False)
    else:
        params['start_urls'] = ''
    # Default column values (fix: removed a duplicate 'exec_level' entry).
    defaultKeys = {
        'app_id': 0,
        'type': 'spider',
        'start_urls': '',
        'exec_level': 0,
        'limit_depth': 2,
        'limit_total': 1000,
        'limit_time': 0,
        'limit_subdomain': 0,
        'limit_image': 0,
        'limit_js': 0,
        'url_unique_mode': 'url-query',
        'notify_url': '',
        'source_ip': '',
        'exclude_urls': '',
        'proxies': '',
        'crontab': '',
        'status': 0,
    }
    # One-shot delayed execution: an absolute 'execute_at' or a relative
    # 'execute_delay' (seconds from now) yields a run date; delay wins when
    # both are present.
    rundate = None
    if 'execute_at' in params.keys() and params['execute_at']:
        rundate = datetime.strptime(params['execute_at'], '%Y-%m-%d %H:%M:%S')
    if 'execute_delay' in params.keys() and params['execute_delay']:
        rundateStr = getTime('%Y-%m-%d %H:%M:%S',
                             getTime() + params['execute_delay'])
        rundate = datetime.strptime(rundateStr, '%Y-%m-%d %H:%M:%S')
    # Persist the task row: update copies only truthy params; insert falls
    # back to defaults for missing/falsy keys.
    taskdata = {}
    keys = defaultKeys.keys()
    if params['id']:
        taskid = params['id']
        for key in keys:
            if key in params.keys() and params[key]:
                taskdata[key] = params[key]
        db.updatebyid('task', taskdata, taskid)
    else:
        taskdata['site_id'] = _getSiteid(startUrls[0])
        for key in keys:
            if key in params.keys() and params[key]:
                taskdata[key] = params[key]
            else:
                taskdata[key] = defaultKeys[key]
        taskid = db.insert('task', taskdata)
    # One-shot scheduled job (date trigger).
    jobid = 'task_%s' % taskid
    if rundate:
        job = db.getbyid('scheduler', jobid)
        if job:
            db.updatebyid('scheduler', {'run_date': rundate}, jobid)
        else:
            scheduler = {
                'id': jobid,
                'name': jobid,
                'func': 'business.task:task_start',
                'args': '[' + str(taskid) + ']',
                'trigger_type': 'date',
                'run_date': rundate,
                'coalesce': 0,
                'next_run_time': rundate,
                'max_instances': 3,
                'executor': 'default',
                # NOTE(review): trailing space in this key matches the cron
                # branch below -- confirm the scheduler table expects it.
                'misfire_grace_time ': 1,
            }
            # Fix: insert() was called without the table name, unlike every
            # other db.insert('<table>', row) call in this module.
            db.insert('scheduler', scheduler)
        return taskid
    # Not a scheduled task: start immediately unless it is cron-driven.
    task = db.fetchone("select * from task where id=:id", {'id': taskid})
    if not task['crontab']:
        task_start(taskid)
        return taskid
    # Cron task that is disabled (status < 1): drop any scheduler job.
    # NOTE(review): on the update path taskdata may lack 'status'/'crontab'
    # (only truthy params are copied), which would raise KeyError here just
    # as in the original -- confirm the expected input contract.
    if taskdata['status'] < 1 and taskdata['crontab']:
        db.exec('delete from scheduler where id=:id', {'id': jobid})
        return taskid
    # Create or refresh the recurring (cron) scheduler job.
    job = db.getbyid('scheduler', jobid)
    if job:
        crontab = '0 ' + task['crontab'] + ' * *,SMHdmwWY'
        db.updatebyid('scheduler', {'crontab': crontab}, jobid)
    else:
        tz = pytz.timezone('Asia/Shanghai')
        scheduler = {
            'id': jobid,
            'name': jobid,
            'func': 'business.task:task_start',
            'args': '[' + str(taskid) + ']',
            'trigger_type': 'cron',
            'crontab': '0 ' + task['crontab'] + ' * *,SMHdmwWY',
            'coalesce': 0,
            'next_run_time': datetime.now(tz=tz).strftime('%Y-%m-%d %H:%M:%S%z'),
            'max_instances': 3,
            'executor': 'default',
            'misfire_grace_time ': 1,
        }
        db.insert('scheduler', scheduler)
    return taskid
def execute_finish():
    '''Detect finished spider batches and finalize their state.

    Scans `stats_batch_run` rows with is_end == 0; a batch counts as done
    when its `stats_batch_stage` counters show undo/ready/doing all empty
    and done == total. For each finished batch the Mongo stats rows are
    flagged, the task_execute row is updated per queue type ('spider',
    'mirror', 'piping'), notifications are sent, and a 'spider' batch is
    additionally forwarded onto the 'piping' queue.

    Returns the list of batch ids finalized during this pass.
    '''
    finishIds = []
    # Check every batch that is not yet marked as ended.
    for row in mongoMq['stats_batch_run'].find({'is_end': 0}, {
            "_id": 0,
            'mqkey': 1,
            'batch': 1
    }):
        mqkey = row['mqkey']
        batch = row['batch']
        stats = mongoMq['stats_batch_stage'].find_one(
            {
                "mqkey": mqkey,
                'batch': batch
            }, {"_id": 0})
        if not stats:
            logger.error("no stats_batch_stage::::%s::::%s" % (mqkey, batch))
            continue
        # Finished: no pending work left and every item accounted for.
        if (not stats['undo'] and not stats['ready'] and not stats['doing']
                and stats['done'] and stats['done'] == stats['total']):
            endAt = getTime('%Y-%m-%d %H:%M:%S')
            # NOTE(review): Collection.update() is the legacy pymongo API
            # (replaced by update_one/update_many) -- confirm driver version.
            mongoMq['stats_batch_stage'].update(
                {
                    "mqkey": mqkey,
                    'batch': batch
                }, {"$set": {
                    'end': 1
                }})
            mongoMq['stats_batch_run'].update({
                "mqkey": mqkey,
                'batch': batch
            }, {"$set": {
                'is_end': 1,
                'end_at': endAt
            }})
            finishIds.append(batch)
            # Crawl batch: tally per-URL success/failure of the start URLs.
            if mqkey == 'spider':
                spiderResult = {'ok': 0, 'failed': 0, 'error': []}
                execute = mongoSpider['execute'].find_one({'id': batch},
                                                          {'_id': 0})
                limit = len(json.loads(execute['start_urls']))
                # NOTE: this loop variable shadows the outer `row`, which is
                # not used again afterwards.
                for row in mongoSpider['spiderurl'].find({
                        'execute_id': batch
                }, {
                        '_id': 0
                }).sort([("id", 1)]).limit(limit):
                    if row['error']:
                        spiderResult['failed'] = spiderResult['failed'] + 1
                        spiderResult['error'].append(row['error'])
                    else:
                        spiderResult['ok'] = spiderResult['ok'] + 1
                # If more than half of the pages failed, the crawl failed.
                if spiderResult['failed'] > limit / 2:
                    result = db.updatebyid(
                        'task_execute', {
                            'status': 3,
                            'end_at': endAt,
                            'error': "\n".join(spiderResult['error'])
                        }, batch)
                    mgdb.execute_save({
                        'status': 2,
                        'end_at': endAt,
                        'error': "\n".join(spiderResult['error']),
                        "id": batch
                    })
                    bNotify.save(batch, eventType='spider_failed')
                else:
                    result = db.updatebyid('task_execute', {
                        'status': 2,
                        'end_at': endAt
                    }, batch)
                    mgdb.execute_save({
                        'status': 2,
                        'end_at': endAt,
                        "id": batch
                    })
                    bNotify.save(batch, eventType='spider_ok')
                # Hand the finished crawl over to the 'piping' queue.
                # NOTE(review): mqidKey/batchKey are assumed to be
                # module-level constants -- not visible in this chunk.
                mqExecute = deepcopy(execute)
                mqExecute[mqidKey] = mqExecute['id']
                mqExecute[batchKey] = mqExecute['id']
                Mq.produce([mqExecute], 'piping')
            # Mirror batch: mark the execute row completed (status 4).
            if mqkey == 'mirror':
                result = db.updatebyid('task_execute', {'status': 4}, batch)
                mgdb.execute_save({'status': 4, "id": batch})
                bNotify.save(batch, eventType='mirror_ok')
            # Piping batch: mark completed and notify with piping status.
            if mqkey == 'piping':
                result = db.updatebyid('task_execute', {'status': 4}, batch)
                mgdb.execute_save({'status': 4, "id": batch})
                bNotify.save(batch, 'piping_all', {'piping_status': 'ok'})
    return finishIds