def result_save(execute, piping, results):
    """Store the outcome of one piping step and return the result row id.

    The row in ``task_piping_result`` is keyed by (execute id, piping id):
    an existing row is overwritten, otherwise a new one is inserted.  A
    ``piping_<type>`` notification is saved afterwards in both cases.

    ``results`` is serialised to a JSON string; per the original note it
    carries sensitive words, keywords, fingerprints, error status codes
    and dark links.
    """
    record = {
        'app_id': execute['app_id'],
        'site_id': execute['site_id'],
        'task_id': execute['task_id'],
        'execute_id': execute['id'],
        'piping_id': piping['id'],
        'type': piping['type'],
        # Non-ASCII text is kept readable in the stored JSON.
        'result': json.dumps(results, ensure_ascii=False),
        'status': 1,
        'audit_status': 0,
    }

    existing = db.fetchone(
        'select id from task_piping_result where execute_id=:eid and piping_id=:pid',
        {'eid': execute['id'], 'pid': piping['id']})
    if not existing:
        row_id = db.insert('task_piping_result', record)
    else:
        row_id = existing['id']
        db.updatebyid('task_piping_result', record, row_id)

    bNotify.save(execute['id'], 'piping_%s' % piping['type'], {'piping_status':'ok'})
    return row_id
def init_system():
    """Seed the bootstrap app row and the system-wide piping_extend rows.

    Inserts the ``tester_app`` record, then makes sure a ``piping_extend``
    row with ``task_id`` 0 exists for the sensitive-word list and for the
    abnormal HTTP status codes, and overwrites the data of each.
    """
    # Import the initial token.
    db.exec("insert into app(id, unique_key, public_key, token, token_expired) values(1,'tester_app','-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEAvLWMYgTwkLMI8ZSw8Pd7NBKUVr0kbyqHijKOOQmR5/EKHOwgak0u\nu3+wBsllmIgfa4cT0zp4Gdd4hx2UmpIjG4eHwCgUCHHmCedu87/zEQhzE2do9p09\nBzPs7GG/azuynPJp6mZFxycaGZaoHH1d3FNWJ+yRBQ5UliFw01Tby3j7cV5u9fNU\nOjSZRGBNkHLxUi56kkbIZ46Wz14DVCjfZh6HRcwWKZHnQTDaIJKGKDbJoAbY/EIi\nrUc8OQl57PNq35hc0AJdFHa5oDQ5WtsCXx3q7XNhKjZdR/Vs4kljns5k9/zylJLn\nXI5ly2j46nz+feMaGVP1BdJpPUVWrAcgFQIDAQAB\n-----END RSA PUBLIC KEY-----','wbsllmigfa4ct0zp4gdd4hx2umpijg4e', '2017-03-22 17:01:53');")

    def ensure_extend(piping_type):
        # Return the id of the task_id=0 row for this type, creating it if absent.
        found = db.fetchone(
            "select * from piping_extend where task_id=:task_id and piping_type=:piping_type limit 1",
            {'task_id': 0, 'piping_type': piping_type})
        if found:
            return found["id"]
        return db.insert("piping_extend", {
            "app_id": 0,
            "site_id": 0,
            "task_id": 0,
            "piping_type": piping_type,
            "data": "",
            "status": 1,
        })

    # System sensitive-word list.
    word_extend_id = ensure_extend('filterword')
    word_text = read("%s/doc/sensitive_word.txt" % PATH_ROOT)
    db.updatebyid("piping_extend", {"data": word_text.strip(), "status": 1}, word_extend_id)

    # System abnormal HTTP status codes.
    # NOTE(review): other functions in this file use the type name
    # 'error_http_code'; confirm 'err_http_code' is what the consumer reads.
    code_extend_id = ensure_extend('err_http_code')
    code_text = "\n".join(
        ['401', '402', '403', '404', '405', '500', '501', '502', '503', '504'])
    db.updatebyid("piping_extend", {"data": code_text.strip(), "status": 1}, code_extend_id)
def getToken_key(key):
    """Issue a fresh access token for the app identified by ``key``.

    Looks the app up by ``unique_key``; returns ``False`` when it does not
    exist.  Otherwise stores a new token that expires 7200 seconds from
    now and returns ``{'token': ..., 'expired': ...}``.

    The token used to be the MD5 of the expiry timestamp.  Since the
    expiry is handed back to the caller, the token was derivable from it,
    and two apps refreshed within the same second received the same
    token.  It is now 16 random bytes rendered as 32 hex characters, the
    same shape as the seeded tokens elsewhere in this file.
    """
    import binascii
    import os

    appObj = db.fetchone('select * from app where unique_key=:key', {'key':key})
    if not appObj:
        return False

    tokenExpired = getTime("%Y-%m-%d %H:%M:%S" , (getTime() + 7200))
    token = binascii.hexlify(os.urandom(16)).decode('ascii')
    db.updatebyid('app', {'token':token, 'token_expired':tokenExpired}, appObj['id'])
    return {'token':token, 'expired':tokenExpired}
def save(params=None):
    """Create or update an ``app`` row and return its id.

    Returns ``False`` when ``params`` is missing/empty or has no
    ``unique_key``.  When ``params`` carries a truthy ``id`` that row is
    updated; otherwise a new row is inserted and its id returned.
    ``public_key`` is still required and raises ``KeyError`` if absent.
    """
    # The default argument is None, so guard before subscripting.
    if not params or not params.get('unique_key'):
        return False

    appdata = {
        'unique_key': params['unique_key'],
        'public_key': params['public_key'],
    }

    # A create request naturally has no id; treat a missing key like a falsy one.
    appid = params.get('id')
    if appid:
        db.updatebyid('app', appdata, appid)
    else:
        appid = db.insert('app', appdata)
    return appid
def save(params=None):
    """Create or update a ``setting`` row and return its id.

    Returns ``False`` when ``params`` is missing/empty or any of ``name``,
    ``key`` or ``value`` is absent or falsy.  ``note`` is optional and
    defaults to an empty string.  A truthy ``id`` selects update;
    otherwise a new row is inserted.
    """
    # The default argument is None, so guard before subscripting.
    if not params:
        return False
    if not params.get('name') or not params.get('key') or not params.get('value'):
        return False

    settingdata = {
        'name': params['name'],
        'key': params['key'],
        'value': params['value'],
        'note': _defaultValue(params, 'note', ''),
    }

    # A create request naturally has no id; treat a missing key like a falsy one.
    settingid = params.get('id')
    if settingid:
        db.updatebyid('setting', settingdata, settingid)
    else:
        settingid = db.insert('setting', settingdata)
    return settingid
def save(params=None):
    """Create or update a ``proxy`` row and return its id.

    Returns ``False`` when ``params`` is missing/empty or lacks a truthy
    ``ip`` or ``port``.  ``username`` and ``passwd`` default to empty
    strings.  A truthy ``id`` selects update; otherwise a new row is
    inserted with ``status`` 0.
    """
    # The default argument is None, so guard before subscripting.
    if not params:
        return False
    if not params.get('ip') or not params.get('port'):
        return False

    proxydata = {
        'ip': params['ip'],
        'port': params['port'],
        'username': _defaultValue(params, 'username', ''),
        'passwd': _defaultValue(params, 'passwd', ''),
    }

    # A create request naturally has no id; treat a missing key like a falsy one.
    proxyid = params.get('id')
    if proxyid:
        db.updatebyid('proxy', proxydata, proxyid)
    else:
        # Only newly created proxies get an explicit initial status.
        proxydata['status'] = 0
        proxyid = db.insert('proxy', proxydata)
    return proxyid
def send(notifyid=None):
    """POST one queued notification to its ``notify_url`` and record the outcome.

    Loads the ``task_notify`` row, merges its stored ``request_data`` JSON
    over the identifying fields, and posts the result as JSON with a
    5 second timeout.

    Row status written back:
      * ``'2'``   - the response body was exactly ``ok``
      * ``'301'`` - a response arrived but the body was something else
      * ``'3'``   - an exception was raised (its repr is stored in ``error``)

    Returns a dict with ``status``, ``msg`` and ``donotify_notifyid``.
    ``status`` is 0 only when the row does not exist; every attempted
    delivery, including a failed one, returns 1.
    """
    row = db.fetchone("select * from task_notify where id=:id", {'id': notifyid})
    if not row:
        return {'status':0, 'msg':'notify[%s] is not exists' % notifyid, 'donotify_notifyid':notifyid}

    try:
        data = {
            'id': row['id'],
            'app_id': row['app_id'],
            'task_id': row['task_id'],
            'site_id': row['site_id'],
            'execute_id': row['execute_id'],
            'task_type': row['task_type'],
        }
        # Stored request data wins over the identifying fields on key clashes.
        requestData = json.loads(row['request_data']) if row['request_data'] else {}
        data = dict(data, **requestData)
        payload = json.dumps(data, ensure_ascii=False)

        request = Request(row['notify_url'], method='POST')
        request.add_header('Content-Type', 'application/json')
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(request, payload.encode('utf8'), timeout=5) as response:
            body = response.read().decode()

        if body == 'ok':
            db.updatebyid('task_notify', {'status':'2', 'response_data':body, 'error': ''}, row['id'])
        else:
            error = 'the httpcode require 200, the body require ok;'
            db.updatebyid('task_notify', {'status':'301', 'response_data':body, 'error': error}, row['id'])
        return {'status':1, 'msg':'notify ok', 'donotify_notifyid':notifyid}
    except Exception as e:
        # Best-effort delivery: record the failure on the row rather than raising.
        logger.error("doNotify::" + str(notifyid) + "::" + repr(e))
        db.updatebyid('task_notify', {'status':'3', 'error':repr(e)}, row['id'])
        return {'status':1, 'msg':repr(e), 'donotify_notifyid':notifyid}
def piping_all(executeid):
    """Run every piping processor configured for the execution's task.

    Returns a dict with ``status``, ``msg`` and ``dopiping_executeid``.
    ``status`` is 0 when the execution cannot be found.  When the task has
    no piping rows the ``task_execute`` row is set to status ``'4'`` and
    the call reports success without running anything.
    """
    execute = mgdb.execute_getbyid(executeid)
    if not execute:
        return {'status':0, 'msg':'task_execute[%s] is not exists' % executeid, 'dopiping_executeid':executeid}

    configured = db.fetchall("select type from task_piping where task_id=:id", {'id':execute['task_id']})
    if not configured:
        db.updatebyid('task_execute', {'status':'4'}, execute['id'])
        return {'status':1, 'msg':'piping ok', 'dopiping_executeid':executeid}

    for entry in configured:
        kind = entry['type']
        # The type names are mutually exclusive, so at most one branch runs.
        if kind == 'filterword':
            piping_filterword(execute['id'])
        elif kind == 'keyword':
            piping_keyword(execute['id'])
        elif kind == 'error_http_code':
            piping_errorHttpCode(execute['id'])
        elif kind == 'fingerprint':
            piping_fingerprint(execute['id'])
        elif kind == 'darklink':
            piping_darklink(execute['id'])

    return {'status':1, 'msg':'piping ok', 'dopiping_executeid':executeid}
def init_app(force=False):
    """Seed the built-in ``app`` rows.  Go-live initialisation only; use with care.

    By default this is a no-op that returns ``True``, exactly as before:
    the seeding code used to sit unreachable behind an unconditional
    ``return True``.  Passing ``force=True`` runs it, upserting the three
    built-in apps (ids 2, 3 and 4) with their public keys and tokens and
    ``token_expired`` set to the current time.
    """
    if not force:
        return True

    # (id, unique_key, public_key, token)
    seeds = (
        (
            2,
            'tsgz',
            "-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEA9Vp7hhFpJe2zYuGDDBQ2wb0e7tKHwfHdE6e8ZUJDkMgPLKBEbHwo\nSuvLXgrtGqjclVSIn6Py+NmQbtWxnOZuV/2O/jzhnflu8vVoXVwEuj4gj3+jGZV4\nB0MFICeZ/+qM2UcqrquxQrLhV1gU8InaaTgkMtC4Iag38YdDUy6MdBH7yOQzmUuq\nd5PhbsZeb45Y2OSuq2jhg3d1Xu1vHIrj1A0jSs99d5lOdubpCu7l1JC3WrjVBISj\nlQnrQmUATVy6Tr0Wvv8n1hqaZVNGpAM6pI4UtF+OldU7MrNqQzc+8a5hj2A2SGZE\nfPgyjaS8p+/K4tECY0STfXtB7wjg8oU8bQIDAQAB\n-----END RSA PUBLIC KEY-----",
            '9a684815a09c65edb52b7612cda4b1ad',
        ),
        (
            3,
            'homev5_apiv4',
            "-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEA7LRjexk787YP48ZiWOwHNa93VF+J0H/pdINSvqIqWU6yAarpkLWq\nKV9Xd27QCcK6z459b0v/S6QplOPWks7m0MCFrflxkAEjd6MtXJiq3a6rcnX1w0vu\nPozNcM8ibLQI6XoSWNx2sUlQDpDdT9JvdGsnoCfY+pAS3gycgAHzFJH9UbY68igk\nn1cqFuADso3YLXZssK+eslnsfK20iZPiobmSWACLz0vi0gxTABSLqXM3ovJBZgiB\n0QUqvKJY1pM0dHpVpnj73y3CutqH+v255x32y2DVfG4AC6hxCojIhQDx8vAqsKc1\nHYcKxCTPGGVGGvmDUDevwvmvF+GjDZ0SQQIDAQAB\n-----END RSA PUBLIC KEY-----",
            '20d812f96badf9f811cde6f9916d5a50',
        ),
        (
            4,
            'homev5_apiv4_mirror',
            "-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEAwT90eOKM9YaUDYM1v2WR1TL7Qf1t3e3ogCkFSSbH0D4IBn/bVOi9\nCq7jDRZH9F75j6uGXymMLGF841kgrgn8NdyalqaLGRrufw+K971UfNToT/SEAW9O\n+HlZLIV+itAVbBly5/LJFc16aPUH2L47r8qFIFB0PfjLSAsHhbRRs6jLyuZTtzGi\no4iod7/5R+ip216fu7cxiAE3wBhfKTT7IYnAnW7+tYLPqlGcszJkSJtZozHcxudw\n4nVRu+2pkP9ud1YnbWSVGDADMQ33YaKSrm4O+dCDw5EqhmYo+0xH39TNS/2GjCK2\n83R0ZvuS9KkYCNhSYYEKVKiyTuavTpsWWwIDAQAB\n-----END RSA PUBLIC KEY-----",
            '69866dde69bced6b006708b936e038c3',
        ),
    )

    for appid, unique_key, public_key, token in seeds:
        app = {
            'id': appid,
            'unique_key': unique_key,
            'public_key': public_key,
            'token': token,
            'token_expired': now_format(),
        }
        # Upsert: overwrite the row when it already exists.
        if db.getbyid('app', appid):
            db.updatebyid('app', app, appid)
        else:
            db.insert('app', app)
    return True
def task_save(params=None):
    """Create or update a task, then schedule or start it; return the task id.

    Returns ``False`` when ``params`` is missing/empty, or when it has
    neither an ``id`` (update) nor ``start_urls`` (create).

    ``params['start_urls']`` is a newline separated string; it is
    rewritten in place to the JSON list that gets stored.

    After saving:
      * with ``execute_at`` or ``execute_delay`` a one-shot ``date`` job is
        created/updated in ``scheduler``;
      * otherwise, if the stored task has no ``crontab`` it is started
        immediately via ``task_start``;
      * otherwise, if the stored status is below 1 its scheduler job is
        deleted;
      * otherwise a ``cron`` job is created/updated.

    On update only truthy values from ``params`` are written, so a field
    cannot be reset to 0 or '' through this function.
    """
    if not params:
        return False
    existing_id = params.get('id')
    if not existing_id and not params.get('start_urls'):
        return False

    if params.get('start_urls'):
        startUrls = params['start_urls'].split("\n")
        params['start_urls'] = json.dumps(startUrls, ensure_ascii=False)
    else:
        params['start_urls'] = ''

    # Column defaults applied on create; also the whitelist of savable keys.
    defaultKeys = {
        'app_id': 0,
        'type': 'spider',
        'start_urls': '',
        'exec_level': 0,
        'limit_depth': 2,
        'limit_total': 1000,
        'limit_time': 0,
        'limit_subdomain': 0,
        'limit_image': 0,
        'limit_js': 0,
        'url_unique_mode': 'url-query',
        'notify_url': '',
        'source_ip': '',
        'exclude_urls': '',
        'proxies': '',
        'crontab': '',
        'status': 0,
    }

    # One-shot schedule: an explicit time, or a delay in seconds from now.
    # When both are given the delay wins, as before.
    time_format = '%Y-%m-%d %H:%M:%S'
    rundate = None
    if params.get('execute_at'):
        rundate = datetime.strptime(params['execute_at'], time_format)
    if params.get('execute_delay'):
        rundateStr = getTime(time_format, getTime() + params['execute_delay'])
        rundate = datetime.strptime(rundateStr, time_format)

    # Save the task row.
    if existing_id:
        taskid = existing_id
        taskdata = {}
        for key in defaultKeys:
            if params.get(key):
                taskdata[key] = params[key]
        db.updatebyid('task', taskdata, taskid)
    else:
        taskdata = {'site_id': _getSiteid(startUrls[0])}
        for key, default in defaultKeys.items():
            taskdata[key] = params[key] if params.get(key) else default
        taskid = db.insert('task', taskdata)

    jobid = 'task_%s' % taskid

    # One-shot scheduled run.
    if rundate:
        if db.getbyid('scheduler', jobid):
            db.updatebyid('scheduler', {'run_date': rundate}, jobid)
        else:
            scheduler = {
                'id': jobid,
                'name': jobid,
                'func': 'business.task:task_start',
                'args': '[' + str(taskid) + ']',
                'trigger_type': 'date',
                'run_date': rundate,
                'coalesce': 0,
                'next_run_time': rundate,
                'max_instances': 3,
                'executor': 'default',
                # NOTE(review): key has a trailing space in the original; kept
                # byte-for-byte, confirm against the scheduler table.
                'misfire_grace_time ': 1,
            }
            # The table name was missing from this call.
            db.insert('scheduler', scheduler)
        return taskid

    # Not a recurring task: run it right away.
    task = db.fetchone("select * from task where id=:id", {'id': taskid})
    if not task['crontab']:
        task_start(taskid)
        return taskid

    # Disabled recurring task: drop its scheduler job.  The stored row is
    # consulted because on update `taskdata` only holds the truthy fields
    # that were passed in, so it may lack 'status' and 'crontab'.
    if task['status'] < 1:
        db.exec('delete from scheduler where id=:id', {'id': jobid})
        return taskid

    # Create or update the recurring job.
    job = db.getbyid('scheduler', jobid)
    crontab = '0 ' + task['crontab'] + ' * *,SMHdmwWY'
    if job:
        db.updatebyid('scheduler', {'crontab': crontab}, jobid)
    else:
        tz = pytz.timezone('Asia/Shanghai')
        scheduler = {
            'id': jobid,
            'name': jobid,
            'func': 'business.task:task_start',
            'args': '[' + str(taskid) + ']',
            'trigger_type': 'cron',
            'crontab': crontab,
            'coalesce': 0,
            'next_run_time': datetime.now(tz=tz).strftime('%Y-%m-%d %H:%M:%S%z'),
            'max_instances': 3,
            'executor': 'default',
            # NOTE(review): trailing space kept byte-for-byte, see above.
            'misfire_grace_time ': 1,
        }
        db.insert('scheduler', scheduler)
    return taskid
def execute_finish():
    """Close out queue batches that have finished and return their batch ids.

    A batch counts as finished when its ``stats_batch_stage`` document has
    nothing in ``undo``/``ready``/``doing`` and a non-zero ``done`` equal
    to ``total``.  Such a batch is flagged as ended in both stats
    collections, then handled by queue key:

      * ``spider`` - the first ``len(start_urls)`` spiderurl documents
        (ordered by id) are inspected; if more than half carry an error
        the execution is marked failed (status 3), otherwise it is marked
        crawled (status 2) and handed to the ``piping`` queue;
      * ``mirror`` - execution status 4, ``mirror_ok`` notification;
      * ``piping`` - execution status 4, ``piping_all`` notification.
    """
    finishIds = []

    # Check every batch that has not been flagged as ended yet.
    pending = mongoMq['stats_batch_run'].find({'is_end': 0}, {"_id": 0, 'mqkey': 1, 'batch': 1})
    for run in pending:
        mqkey = run['mqkey']
        batch = run['batch']
        stats = mongoMq['stats_batch_stage'].find_one({"mqkey": mqkey, 'batch': batch}, {"_id": 0})
        if not stats:
            logger.error("no stats_batch_stage::::%s::::%s" % (mqkey, batch))
            continue

        finished = (not stats['undo'] and not stats['ready']
                    and not stats['doing'] and stats['done']
                    and stats['done'] == stats['total'])
        if not finished:
            continue

        endAt = getTime('%Y-%m-%d %H:%M:%S')
        mongoMq['stats_batch_stage'].update(
            {"mqkey": mqkey, 'batch': batch},
            {"$set": {'end': 1}})
        mongoMq['stats_batch_run'].update(
            {"mqkey": mqkey, 'batch': batch},
            {"$set": {'is_end': 1, 'end_at': endAt}})
        finishIds.append(batch)

        # Crawl finished.
        if mqkey == 'spider':
            execute = mongoSpider['execute'].find_one({'id': batch}, {'_id': 0})
            if not execute:
                # Without the execute document the outcome cannot be judged;
                # log it and keep sweeping the remaining batches.
                logger.error("no execute::::%s::::%s" % (mqkey, batch))
                continue

            limit = len(json.loads(execute['start_urls']))
            okCount = 0
            errors = []
            urlDocs = mongoSpider['spiderurl'].find(
                {'execute_id': batch}, {'_id': 0}).sort([("id", 1)]).limit(limit)
            for urlDoc in urlDocs:
                # A document with no 'error' field is counted as a success.
                if urlDoc.get('error'):
                    errors.append(urlDoc['error'])
                else:
                    okCount += 1

            # The crawl fails when more than half of the inspected pages failed.
            if len(errors) > limit / 2:
                errorText = "\n".join(errors)
                db.updatebyid(
                    'task_execute',
                    {'status': 3, 'end_at': endAt, 'error': errorText},
                    batch)
                # This used to write status 2 while the SQL row got 3; every
                # other branch keeps the two stores in agreement.
                mgdb.execute_save({
                    'status': 3,
                    'end_at': endAt,
                    'error': errorText,
                    "id": batch
                })
                bNotify.save(batch, eventType='spider_failed')
            else:
                db.updatebyid('task_execute', {'status': 2, 'end_at': endAt}, batch)
                mgdb.execute_save({'status': 2, 'end_at': endAt, "id": batch})
                bNotify.save(batch, eventType='spider_ok')
                # Hand the crawled execution over to the piping queue.
                mqExecute = deepcopy(execute)
                mqExecute[mqidKey] = mqExecute['id']
                mqExecute[batchKey] = mqExecute['id']
                Mq.produce([mqExecute], 'piping')

        if mqkey == 'mirror':
            db.updatebyid('task_execute', {'status': 4}, batch)
            mgdb.execute_save({'status': 4, "id": batch})
            bNotify.save(batch, eventType='mirror_ok')

        if mqkey == 'piping':
            db.updatebyid('task_execute', {'status': 4}, batch)
            mgdb.execute_save({'status': 4, "id": batch})
            bNotify.save(batch, 'piping_all', {'piping_status': 'ok'})

    return finishIds
def execute_init(eid):
    """Prepare an execution for crawling.  May be called repeatedly.

    Copies the ``task_execute`` row into the Mongo ``execute`` collection
    when it is not there yet, inserts spiderurl documents for the start
    URLs when fewer exist than there are start URLs, and feeds the
    ``spider`` queue (plus ``mirror`` for mirror tasks) when the spider
    queue stages hold fewer entries than there are start URLs.

    Returns ``False`` when the execution row does not exist, else ``True``.
    """
    execute = db.getbyid('task_execute', eid)
    if not execute:
        return False

    # Render the timestamp columns as text; empty values become ''.
    for column in ('create_at', 'update_at', 'start_at', 'end_at'):
        execute[column] = formatTimestamp(execute[column]) if execute[column] else ''
    execute['status'] = 101

    mgExecute = mgdb.execute_getbyid(eid)
    if not mgExecute:
        mgdb.c_insert('execute', deepcopy(execute), autoid=False)

    startUrls = json.loads(execute['start_urls'])
    startUrlsLen = len(startUrls)
    urlCount = mongoSpider['spiderurl'].find({'execute_id': eid}, {'_id': 0}).count()
    if startUrlsLen > urlCount:
        mgdb.c_insert_batch('spiderurl', [
            {
                'site_id': execute['site_id'],
                'task_id': execute['task_id'],
                'app_id': execute['app_id'],
                'task_type': execute['task_type'],
                'execute_id': eid,
                'exec_level': execute['exec_level'],
                'url': startUrl,
                'url_type': 'self',
                'method': 'get',
                'status': 0,
                'create_at': now_format(),
                'update_at': now_format(),
            }
            for startUrl in startUrls
        ])

    # Tag every URL document with the queue id and batch keys.  Both queue
    # payloads reference the same document objects.
    spiderJobs = []
    mirrorJobs = []
    for urlDoc in list(mongoSpider['spiderurl'].find({'execute_id': eid}, {'_id': 0})):
        urlDoc[mqidKey] = urlDoc['id']
        urlDoc[batchKey] = urlDoc['execute_id']
        spiderJobs.append(urlDoc)
        mirrorJobs.append(urlDoc)

    # Count what the spider queue already holds for this batch, per stage.
    queued = 0
    for stage in ('undo', 'ready', 'doing', 'done'):
        queued += mongoMq['mq_spider_' + stage].find({batchKey: eid}).count()

    if startUrlsLen > queued:
        # Fill the spider queue.
        Mq.produce(spiderJobs, 'spider')
        # Fill the mirror queue as well for mirror tasks.
        if execute['task_type'] == 'mirror':
            Mq.produce(mirrorJobs, 'mirror')

    if not mgExecute:
        db.updatebyid('task_execute', {'status': 101}, eid)
    return True
def task_delete_id(taskid):
    """Soft-delete a task by setting its status to -1.

    Returns whatever ``db.updatebyid`` returns.  The value used to be
    computed and then dropped, so the function always returned ``None``
    and callers could not see the outcome of the update.
    """
    return db.updatebyid('task', {'status': -1}, taskid)
def _merge_filterwords(operate, words_new, words_old):
    """Combine new and stored filter words; return them unique and non-blank."""
    if operate == 'own':
        # Replace the task's own list.
        merged = list(words_new)
    elif operate == 'plus':
        # Add words to the stored list.
        merged = list(words_new) + list(words_old)
    elif operate == 'reduce' and words_old:
        # Remove words from the stored list.  Every occurrence is dropped;
        # list.remove() only dropped the first, so a duplicate survived.
        dropped = set(words_new)
        merged = [word for word in words_old if word not in dropped]
    else:
        merged = []

    # De-duplicate in first-seen order and drop blanks.  Blanks used to be
    # removed once, before de-duplication, so one could survive.
    seen = set()
    unique = []
    for word in merged:
        if word != '' and word not in seen:
            seen.add(word)
            unique.append(word)
    return unique


def _save_piping_extend(task, taskid, piping_type, data, existing, extend_id):
    """Update the piping_extend row when one exists, else insert; return its id."""
    payload = {
        'app_id': task['app_id'],
        'site_id': task['site_id'],
        'task_id': taskid,
        'piping_type': piping_type,
        'data': data,
        'status': 1,
    }
    if existing:
        db.updatebyid('piping_extend', payload, extend_id)
        return extend_id
    return db.insert('piping_extend', payload)


def piping_save(rows=None, taskid=None):
    """Save the piping configuration rows of a task.

    Each row names a piping ``type``.  Type-specific data is written to
    ``piping_extend`` (updated when the task's piping already points at an
    extend row, inserted otherwise), then the ``task_piping`` row itself
    is updated or inserted.

    Per type:
      * ``darklink`` - ``white_list`` / ``black_list`` stored as one JSON object;
      * ``filterword`` - needs ``filterwords`` and ``filterword_operate``
        (``own`` replaces, ``plus`` adds, ``reduce`` removes);
      * ``keyword`` - ``keywords`` stored as JSON, or '' when empty;
      * ``error_http_code`` - ``http_codes`` stored as given.

    Returns ``True`` on completion (including when there are no rows) and
    ``False`` when the task does not exist.
    """
    if not rows:
        return True
    task = db.fetchone('select * from task where id=:id', {'id':taskid})
    if not task:
        return False

    for row in rows:
        pipingType = row['type']
        taskPiping = db.fetchone('select * from task_piping where task_id=:tid and type=:type', {'tid':taskid, 'type':pipingType})

        # An extend id of 0 means the system default word list is used.
        extendId = 0
        pipingExtendOld = None
        if taskPiping:
            extendId = taskPiping['extend_id']
            pipingExtendOld = db.fetchone('select * from piping_extend where id=:id', {'id': extendId})

        hasData = True
        if pipingType == 'darklink':
            # The lists used to be round-tripped through json.dumps() and
            # eval().  eval() on request-derived data is unsafe and fails on
            # true/false/null, so the values are now used directly.
            data = json.dumps({
                'white_list': row.get('white_list') or [],
                'black_list': row.get('black_list') or [],
            })
        elif pipingType == 'filterword' and 'filterwords' in row and 'filterword_operate' in row:
            wordsNew = row['filterwords'].replace(' ', '').split("\n")
            wordsOld = []
            if pipingExtendOld:
                extendId = pipingExtendOld['id']
                wordsOld = pipingExtendOld['data'].split("\n") if pipingExtendOld['data'] else []
            data = "\n".join(_merge_filterwords(row['filterword_operate'], wordsNew, wordsOld))
        elif pipingType == 'keyword' and 'keywords' in row:
            data = json.dumps(row['keywords'], ensure_ascii=False) if row['keywords'] else ''
        elif pipingType == 'error_http_code' and 'http_codes' in row:
            data = row['http_codes']
        else:
            hasData = False

        if hasData:
            extendId = _save_piping_extend(task, taskid, pipingType, data, pipingExtendOld, extendId)

        piping = {}
        piping['status'] = row.get('status', 1)
        piping['extend_id'] = extendId
        piping['filterword_type'] = row.get('filterword_type', '')
        if taskPiping:
            db.updatebyid('task_piping', piping, taskPiping['id'])
        else:
            piping['app_id'] = task['app_id']
            piping['site_id'] = task['site_id']
            piping['task_id'] = taskid
            piping['type'] = pipingType
            db.insert('task_piping', piping)
    return True