def result_getall_executeid(executeid=None):
    '''Fetch the data-processing (piping) results stored for one execution.

    Rows are read from ``task_piping_result`` by ``execute_id`` and rewritten
    in place: ``create_at`` is passed through ``formatTimestamp`` and
    ``result`` is decoded from its JSON text.

    :param executeid: value bound to ``execute_id`` in the query.
    :return: the rows returned by ``db.fetchall``, after the in-place rewrite.
    '''
    query = 'select task_id,execute_id,type,result,create_at from task_piping_result where execute_id=:eid'
    records = db.fetchall(query, {'eid': executeid})
    for record in records:
        created = formatTimestamp(record['create_at'])
        record['create_at'] = created
        decoded = json.loads(record['result'])
        record['result'] = decoded
    return records
def getall():
    '''Return every ``task_notify`` row, highest id first.

    ``create_at`` and ``update_at`` of each row are replaced by the value
    ``formatTimestamp`` returns for them.

    :return: a new list holding the (mutated) rows in query order.
    '''
    timestamp_columns = ('create_at', 'update_at')
    notifies = []
    for notify in db.fetchall("select * from task_notify order by id desc"):
        for column in timestamp_columns:
            notify[column] = formatTimestamp(notify[column])
        notifies.append(notify)
    return notifies
def mq_correct():
    '''Runtime error correction for executions whose spider data is missing.

    The check is currently switched off: the function returns ``True``
    before doing any work, so nothing below the early return runs. The
    dormant logic is kept (and its logging fixed) so it can be re-enabled by
    removing that single ``return``.

    Dormant behaviour, as written below:
      * load the previous check state from the redis hash
        ``mq_correct_running`` / field ``checkdb``;
      * for every ``task_execute`` row with status 0, 1 or 2 that was not
        already marked ``'ok'``, look for its ``execute`` document, its
        ``spiderurl`` documents and its ``mq_spider_*`` queue documents;
      * store the refreshed state back into redis and call
        ``bTask.execute_init`` for every execution that had an error.

    :return: always ``True`` while the check is disabled.
    '''
    # Deliberate short-circuit carried over from the original code: the
    # correction pass is disabled. Everything after this line is unreachable.
    return True

    rawJson = redis.hget('mq_correct_running', 'checkdb')
    dataOld = json.loads(rawJson.decode()) if rawJson else {}
    dataNew = {}
    rows = db.fetchall(
        'select id,status from task_execute where status in(0,1,2) order by id asc;'
    )
    stages = ['undo', 'ready', 'doing', 'done']
    batchErrors = {}
    for row in rows:
        errors = []
        eid = str(row['id'])
        dataNew[eid] = 'ok'
        logger.debug("mq execute check ::::%s" % eid)
        # Executions already verified on a previous pass are not re-checked.
        if dataOld.get(eid) == 'ok':
            continue

        logger.debug("mq_correct_running checkdb::::%s" % eid)
        execute = mongoSpider['execute'].find_one({'id': row['id']}, {'_id': 0})
        if not execute:
            errors.append('noexecute')

        urlCount = mongoSpider['spiderurl'].find(
            {'execute_id': row['id']}, {'_id': 0}
        ).count()
        if not urlCount:
            errors.append('nourl')

        pre = 'mq_spider_'
        stats = {
            stage: mongoMq[pre + stage].find({'mq_batch': row['id']}).count()
            for stage in stages
        }
        total = stats['undo'] + stats['ready'] + stats['doing'] + stats['done']
        if not total:
            errors.append('nomq')

        if errors:
            batchErrors[eid] = errors
            dataNew[eid] = 'uncheck'

    # Forget executions that are no longer in a checked status.
    for batch in set(dataOld) - set(dataNew):
        del dataOld[batch]
    dataOld.update(dataNew)
    redis.hset('mq_correct_running', 'checkdb',
               json.dumps(dataOld, ensure_ascii=False))
    # Log the errors of every execution. The original logged the loop-local
    # `errors`, which is undefined when there are no rows and otherwise only
    # reflects the last row.
    logger.debug("mq execute error ::::%s" % json.dumps(batchErrors, ensure_ascii=False))

    # URL data was not written: re-initialise the affected executions.
    for batch in batchErrors:
        bTask.execute_init(batch)
def getall():
    '''Return every row of the ``setting`` table.

    ``create_at`` and ``update_at`` of each row are replaced by the value
    ``formatTimestamp`` returns for them.

    :return: a new list holding the (mutated) rows in query order.
    '''
    settings = []
    for setting in db.fetchall("select * from setting"):
        created = formatTimestamp(setting['create_at'])
        setting['create_at'] = created
        updated = formatTimestamp(setting['update_at'])
        setting['update_at'] = updated
        settings.append(setting)
    return settings
def getall():
    '''Return every row of the ``proxy`` table.

    ``create_at`` and ``update_at`` of each row are replaced by the value
    ``formatTimestamp`` returns for them.

    :return: a new list holding the (mutated) rows in query order.
    '''
    query = "select * from proxy"
    proxies = []
    for proxy in db.fetchall(query):
        for column in ('create_at', 'update_at'):
            proxy[column] = formatTimestamp(proxy[column])
        proxies.append(proxy)
    return proxies
def getall():
    '''Return every ``app`` row, highest id first.

    ``token_expired``, ``create_at`` and ``update_at`` of each row are
    replaced, in that order, by the value ``formatTimestamp`` returns.

    :return: a new list holding the (mutated) rows in query order.
    '''
    timestamp_columns = ('token_expired', 'create_at', 'update_at')
    apps = []
    for app in db.fetchall("select * from app order by id desc"):
        for column in timestamp_columns:
            app[column] = formatTimestamp(app[column])
        apps.append(app)
    return apps
def ApsCheckDb():
    '''Tell whether the ``scheduler`` table has ids unknown to the scheduler.

    The ids read from the ``scheduler`` table are compared with the ids of
    the jobs returned by ``sched.get_jobs()``; ids present only in the table
    are logged at debug level.

    :return: ``True`` when at least one table id has no job with the same
        id, otherwise ``False``.
    '''
    db_ids = {row['id'] for row in db.fetchall("select id,crontab from scheduler")}
    job_ids = {job.id for job in sched.get_jobs()}

    # NOTE(review): ids are compared as-is; confirm both sides use the same
    # type, otherwise every table id is reported as new.
    tasksNew = list(db_ids - job_ids)

    # str.join only accepts strings; convert so that non-string ids cannot
    # make a debug log line abort the whole check.
    logger.debug("tasksNew::::%s" % ",".join(map(str, tasksNew)))
    return bool(tasksNew)
def piping_getall_taskid(taskid=None):
    '''Get the data-processing pipings configured for a task.

    Every ``task_piping`` row of the task gets its ``create_at`` and
    ``update_at`` passed through ``formatTimestamp``. Rows of type
    ``filterword`` or ``keyword`` additionally get a ``words`` entry taken
    from the ``data`` column of their ``piping_extend`` row.

    :param taskid: value bound to ``task_id`` in the query.
    :return: dict mapping piping type to its row; when several rows share a
        type, the last one read wins.
    '''
    pipings = {}
    taskPipings = db.fetchall('select * from task_piping where task_id=:id', {'id': taskid})
    for piping in taskPipings:
        piping['create_at'] = formatTimestamp(piping['create_at'])
        piping['update_at'] = formatTimestamp(piping['update_at'])
        pipingType = piping['type']
        if pipingType in ('filterword', 'keyword'):
            pipingExtend = db.fetchone('select * from piping_extend where id=:id', {'id': piping['extend_id']})
            # A piping whose extend row is missing used to crash here on
            # `None['data']`; fall back to an empty word list instead, the
            # same default piping_filterword uses.
            piping['words'] = pipingExtend['data'] if pipingExtend else ''
        pipings[pipingType] = piping
    return pipings
def piping_filterword(executeid):
    '''Sensitive-word filtering for the pages fetched by one execution.

    Builds the word list selected by the piping's ``filterword_type``
    (``system``: names from ``sys_filterword``; ``own``: newline-separated
    ``data`` of the piping's ``piping_extend`` row; ``mixed``: both), scans
    the body of every fetched page whose ``url_type`` is ``self`` and whose
    extension is ``html`` or empty, and stores a snapshot plus a result
    record for each page with matches.

    :param executeid: id of the ``task_execute`` row to process.
    :return: ``False`` when the execution does not exist; ``True`` when no
        active filterword piping is configured, the word list is empty or
        nothing matched; otherwise whatever ``result_save`` returns.
    '''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    if not execute:
        # Previously crashed on `None['task_id']`; piping_darklink already
        # reports an unknown execution this way.
        return False
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type':'filterword'})
    if not piping:
        return True

    filterword_type = piping['filterword_type']
    words = set()
    # System word library
    if filterword_type in ('system', 'mixed'):
        system_rows = db.fetchall('select name from sys_filterword')
        if system_rows:
            words.update(row['name'] for row in system_rows)
    # Own word library
    if filterword_type in ('own', 'mixed'):
        pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
        if pipingExtend:
            words.update(pipingExtend['data'].split('\n'))
    # Blank lines in the own word list must not become a search term.
    words.discard('')
    if not words:
        return True

    acism = Acism(list(words))
    results = []
    rows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id','url', 'file_path', 'file_extension','url_type'])
    for row in rows:
        if row['url_type'] != 'self':
            continue
        if row['file_extension'] not in ('html', ''):
            continue

        response = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path']))
        try:
            body = response.read().decode('utf-8', 'ignore')
        finally:
            # Release the connection even when reading fails.
            response.close()

        result = acism.scan(body)
        if not result:
            continue
        filename = "snap_code_filterword_%s.png" % row['id']
        snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=result.keys())
        pipingResult = {"id":row['id'], "url":row['url'], "matches":result, 'snapshot':"\n".join(snapshots)}
        snapshot_insert(executeid, piping, row, pipingResult, snapshots)
        results.append(pipingResult)

    if results:
        return result_save(execute, piping, results)
    return True
def execute_getall_taskid(taskid):
    '''Return all ``task_execute`` rows of a task, highest id first.

    ``start_at`` and ``end_at`` are passed through ``formatTimestamp`` when
    truthy and replaced by ``''`` otherwise; ``create_at`` and ``update_at``
    are always passed through ``formatTimestamp``.

    :param taskid: value bound to ``task_id`` in the query.
    :return: ``False`` when the query yields nothing, otherwise a new list
        holding the (mutated) rows.
    '''
    found = db.fetchall(
        'select * from task_execute where task_id=:task_id order by id desc',
        {'task_id': taskid})
    if not found:
        return False

    executes = []
    for item in found:
        for optional in ('start_at', 'end_at'):
            if item[optional]:
                item[optional] = formatTimestamp(item[optional])
            else:
                item[optional] = ''
        for required in ('create_at', 'update_at'):
            item[required] = formatTimestamp(item[required])
        executes.append(item)
    return executes
def postgres2mongo(executeid):
    '''Copy the ``spider_url`` rows of one execution from the database to MongoDB.

    Each of ``start_at``, ``end_at``, ``create_at`` and ``update_at`` is
    passed through ``formatTimestamp`` when truthy and replaced by ``''``
    otherwise; the rows are then inserted into the ``spiderurl`` collection
    in a single call.

    example:
    #for executeid in list(range(10000)):
    #    postgres2mongo(executeid)

    :param executeid: value bound to ``execute_id`` in the query.
    :return: ``False`` when the execution has no rows, otherwise ``True``.
    '''
    records = db.fetchall('select * from spider_url where execute_id=:id', {'id': executeid})
    if not records:
        return False

    timestamp_columns = ('start_at', 'end_at', 'create_at', 'update_at')
    for record in records:
        for column in timestamp_columns:
            if record[column]:
                record[column] = formatTimestamp(record[column])
            else:
                record[column] = ''

    mongoSpider['spiderurl'].insert(records)
    return True
def task_getall(page=1, pagesize=20):
    '''Return one page of ``task`` rows, highest id first.

    Falsy arguments fall back to the defaults (page 1, 20 rows); other values
    are converted with ``_str2int``. A page below 1 becomes 1 and a page size
    below 1 becomes 20.

    :param page: 1-based page number.
    :param pagesize: number of rows per page.
    :return: ``False`` when the page is empty, otherwise a new list of rows
        whose ``create_at`` / ``update_at`` went through ``formatTimestamp``.
    '''
    page = _str2int(page) if page else 1
    pagesize = _str2int(pagesize) if pagesize else 20
    if page < 1:
        page = 1
    if pagesize < 1:
        pagesize = 20

    window = {'limit': pagesize, 'offset': (page - 1) * pagesize}
    found = db.fetchall(
        'select * from task order by id desc limit :limit offset :offset;',
        window)
    if not found:
        return False

    tasks = []
    for task in found:
        for column in ('create_at', 'update_at'):
            task[column] = formatTimestamp(task[column])
        tasks.append(task)
    return tasks
def process_initexec(index):
    '''Worker loop that corrects data by initialising pending executions.

    Sets the process title to ``spider-initexec-<index>`` and then loops
    forever: every pass calls ``bTask.execute_init`` for each ``task_execute``
    row with ``status=0`` (ascending id) and sleeps 10 seconds. Once more
    than 10 seconds have passed since the last liveness check,
    ``_checkActive`` is asked about ``initexec-<index>``; a falsy answer ends
    the process via ``exit()``.

    :param index: worker number used in the process title.
    '''
    mqkey = 'initexec'
    procname = "spider-%s-%s" % (mqkey, index)
    setproctitle(procname)

    # Project modules are imported inside the worker, after the title is set.
    from common import db, mongoMq
    import business.task as bTask

    last_check = time()
    while True:
        # If the worker has been stopped, exit the program.
        elapsed = time() - last_check
        if elapsed > 10:
            # procname[7:] drops the leading "spider-" prefix.
            if not _checkActive(mongoMq, procname[7:]):
                exit()
            last_check = time()

        pending = db.fetchall(
            "select id from task_execute where status=0 order by id asc")
        for pending_row in pending:
            bTask.execute_init(pending_row['id'])
        sleep(10)
def piping_all(executeid):
    '''Run every data-processing step configured for an execution's task.

    The execution is loaded through ``mgdb.execute_getbyid``. When its task
    has no ``task_piping`` rows, the ``task_execute`` row is updated with
    status ``'4'``; otherwise the handler matching each piping type
    (``filterword``, ``keyword``, ``error_http_code``, ``fingerprint``,
    ``darklink``) is called with the execution id. Handler return values are
    ignored.

    :param executeid: id of the execution to process.
    :return: dict with ``status`` (0 when the execution is unknown, else 1),
        ``msg`` and ``dopiping_executeid``.
    '''
    execute = mgdb.execute_getbyid(executeid)
    if not execute:
        return {'status':0, 'msg':'task_execute[%s] is not exists' % executeid, 'dopiping_executeid':executeid}

    configured = db.fetchall("select type from task_piping where task_id=:id", {'id':execute['task_id']})
    if configured:
        for entry in configured:
            kind = entry['type']
            if kind == 'filterword':
                piping_filterword(execute['id'])
            elif kind == 'keyword':
                piping_keyword(execute['id'])
            elif kind == 'error_http_code':
                piping_errorHttpCode(execute['id'])
            elif kind == 'fingerprint':
                piping_fingerprint(execute['id'])
            elif kind == 'darklink':
                piping_darklink(execute['id'])
    else:
        # Nothing to process for this task.
        db.updatebyid('task_execute', {'status':'4'}, execute['id'])

    return {'status':1, 'msg':'piping ok', 'dopiping_executeid':executeid}
def _darklink_report(executeid, piping, row, level):
    '''Snapshot one dark-link hit, persist it and return its result record.'''
    filename = 'snap_code_darklink_%s.png' % row['id']
    snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=[row['url']])
    result = {'id':row['id'],'referer':row['referer'],'darklink':row['url'],'level':level,'snapshot':"\n".join(snapshots)}
    snapshot_insert(executeid, piping, row, result, snapshots)
    return result


def piping_darklink(executeid):
    '''Dark-link detection for the URLs fetched by one execution.

    Each fetched URL is classified by the first rule that applies:
      1. personal white list  -> skipped;
      2. personal black list  -> reported with level ``absolute``;
      3. ``url_type`` other than ``other``, or an extension other than ``''``
         / ``html`` -> skipped (static files are not dark links);
      4. system white list    -> skipped;
      5. system black list    -> reported with level ``absolute``;
      6. body contains a system sensitive word -> level ``high`` (the matched
         words are also written to the ``outlink`` collection);
      7. ``invisible`` flag set -> level ``low``.
    A URL is reported at most once.

    :param executeid: id of the execution to process.
    :return: ``False`` when the execution does not exist; ``True`` when no
        active darklink piping is configured or nothing was detected;
        otherwise whatever ``result_save`` returns.
    '''
    execute = mgdb.execute_getbyid(executeid)
    if not execute:
        return False
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type':'darklink'})
    if not piping:
        return True

    # System sensitive words; an empty list is handed to the matcher as-is.
    word_rows = db.fetchall('select name from sys_filterword')
    acism = Acism([word_row['name'] for word_row in word_rows] if word_rows else [])

    # URLs in the system white/black list tables, de-duplicated.
    # NOTE(review): the `domain` column is compared against full row URLs
    # below; confirm that is the intended match.
    white_rows = db.fetchall('select domain from dk_white_list')
    whites_glb = {white_row['domain'] for white_row in white_rows} if white_rows else set()
    black_rows = db.fetchall('select domain from dk_black_list')
    blacks_glb = {black_row['domain'] for black_row in black_rows} if black_rows else set()

    # URLs in the personal white/black lists of this piping.
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
    if pipingExtend:
        # SECURITY NOTE(review): eval() executes whatever text is stored in
        # piping_extend.data. Left in place because the stored format is not
        # visible here; replace with a safe parser once the format is known.
        personal = eval(pipingExtend['data'])
        whites_psl = set(personal['white_list'])
        blacks_psl = set(personal['black_list'])
    else:
        # No active extend row used to crash on `None['data']`; treat it as
        # "no personal lists".
        whites_psl = set()
        blacks_psl = set()

    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id', 'url', 'md5_url', 'md5_body', 'url_type','file_extension','file_path', 'invisible','referer'])
    results = []
    reported = set()  # URLs already present in `results`
    for row in mgRows:
        url = row['url']

        # Personal white/black lists
        if url in whites_psl:
            continue
        if url in blacks_psl:
            if url not in reported:
                results.append(_darklink_report(executeid, piping, row, 'absolute'))
                reported.add(url)
            continue

        # Static files are not dark links
        if row['url_type'] != 'other':
            continue
        if row['file_extension'] not in ('', 'html'):
            continue

        # System white/black lists (certainty: absolute)
        if url in whites_glb:
            continue
        if url in blacks_glb:
            if url not in reported:
                results.append(_darklink_report(executeid, piping, row, 'absolute'))
                reported.add(url)
            continue

        # Sensitive-word detection (suspicion: high)
        response = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path']))
        try:
            body = response.read().decode('utf-8', 'ignore')
        finally:
            # Release the connection even when reading fails.
            response.close()
        resultWord = acism.scan(body)
        if resultWord:
            if url not in reported:
                results.append(_darklink_report(executeid, piping, row, 'high'))
                reported.add(url)
                # NOTE(review): the outlink update is kept next to the first
                # report of a URL; confirm whether it should also run for
                # URLs that were already reported.
                finddata = {'domain':execute['domain'], 'md5_body':row['md5_body']}
                setdata = {'$set':{'filterwords':json.dumps(resultWord, ensure_ascii=False)}}
                mongoSpider['outlink'].find_and_modify(finddata, setdata)
            continue

        # Visibility check (suspicion: low)
        if row['invisible'] and url not in reported:
            results.append(_darklink_report(executeid, piping, row, 'low'))
            reported.add(url)

        # Not implemented yet (sketch kept from the original):
        # duplicate detection -- above the reference threshold the suspicion
        # is high, below it medium. The reference count is not strictly
        # defined yet (TBD).
        # if row['file_extension']:
        #     body = urlparse()
        # match = {'$match':{'md5_url':row['md5_url']}}
        # group = {'$group':{'_id':'$domain', 'count':{'$sum':1}}}
        # results = [i for i in mongoSpider['outlink'].aggregate([match, group])]
        # if len(results) > 500:

    # The original fell off the end (returning None) when nothing was found;
    # report success explicitly, as the other piping handlers do.
    return result_save(execute, piping, results) if results else True