Example 1
def result_getall_executeid(executeid = None):
    '''Fetch data-processing results'''
    pipingResults = db.fetchall('select task_id,execute_id,type,result,create_at from task_piping_result where execute_id=:eid', {'eid':executeid})
    for row in pipingResults:
        row['create_at'] = formatTimestamp(row['create_at'])
        row['result'] = json.loads(row['result'])
    return pipingResults
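
Every example in this listing runs epoch timestamps through a formatTimestamp helper that the listings never show. A minimal sketch of a compatible helper, assuming the *_at columns hold Unix-epoch integers (the output format string is an assumption):

from datetime import datetime

def formatTimestamp(ts):
    # Hypothetical stand-in: render a Unix-epoch timestamp as a readable string.
    # The real helper in this codebase may use a different output format.
    return datetime.fromtimestamp(int(ts)).strftime('%Y-%m-%d %H:%M:%S')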
Example 2
def getall():
    sql = "select * from task_notify order by id desc"
    rows = db.fetchall(sql)
    notifies = []
    for row in rows:
        row['create_at'] = formatTimestamp(row['create_at'])
        row['update_at'] = formatTimestamp(row['update_at'])
        notifies.append(row)
    return notifies
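
The db.fetchall/db.fetchone calls throughout these listings use :name placeholders, which matches SQLAlchemy's text() parameter binding, and return rows the examples mutate like dicts. A minimal sketch of a compatible wrapper (SQLAlchemy 1.4+ API; the class itself is hypothetical):

from sqlalchemy import create_engine, text

class Db:
    '''Hypothetical wrapper matching the db.fetchall/db.fetchone usage above.'''

    def __init__(self, dsn):
        self.engine = create_engine(dsn)

    def fetchall(self, sql, params=None):
        # Return every row as a plain mutable dict, like the examples expect.
        with self.engine.connect() as conn:
            result = conn.execute(text(sql), params or {})
            return [dict(row._mapping) for row in result]

    def fetchone(self, sql, params=None):
        rows = self.fetchall(sql, params)
        return rows[0] if rows else None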
Example 3
def mq_correct():
    '''Correct runtime inconsistencies between the DB and the message queues'''
    return True  # NOTE: this early return disables the whole check below; kept as in the source
    rawJson = redis.hget('mq_correct_running', 'checkdb')
    dataOld = json.loads(rawJson.decode()) if rawJson else {}
    dataNew = {}
    rows = db.fetchall(
        'select id,status from task_execute where status in(0,1,2) order by id asc;'
    )
    stages = ['undo', 'ready', 'doing', 'done']
    batchErrors = {}
    for row in rows:
        errors = []
        eid = str(row['id'])
        dataNew[eid] = 'ok'
        logger.debug("mq execute check ::::%s" % eid)

        if dataOld.get(eid) == 'ok': continue

        logger.debug("mq_correct_running checkdb::::%s" % eid)
        execute = mongoSpider['execute'].find_one({'id': row['id']},
                                                  {'_id': 0})
        if not execute: errors.append('noexecute')

        urlCount = mongoSpider['spiderurl'].find({
            'execute_id': row['id']
        }, {
            '_id': 0
        }).count()
        if not urlCount: errors.append('nourl')

        pre = 'mq_spider_'
        stats = {
            stage: mongoMq[pre + stage].find({
                'mq_batch': row['id']
            }).count()
            for stage in stages
        }
        total = sum(stats.values())
        if not total: errors.append('nomq')

        if errors:
            batchErrors[eid] = errors
            dataNew[eid] = 'uncheck'
    idsDel = list(set(dataOld.keys()) - set(dataNew.keys()))
    for batch in idsDel:
        del dataOld[batch]
    dataOld.update(dataNew)
    redis.hset('mq_correct_running', 'checkdb',
               json.dumps(dataOld, ensure_ascii=False))

    logger.debug("mq execute error ::::%s" %
                 json.dumps(errors, ensure_ascii=False))

    # URL data was never written: re-initialize those executes
    for batch, errors in batchErrors.items():
        bTask.execute_init(batch)
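
Example 3 persists its per-execute check state as a JSON blob stored in one field of a Redis hash. The round-trip pattern in isolation (redis-py; key and field names taken from the example):

import json
import redis

r = redis.Redis()

def load_state():
    # hget returns bytes or None; fall back to an empty state.
    raw = r.hget('mq_correct_running', 'checkdb')
    return json.loads(raw.decode()) if raw else {}

def save_state(state):
    r.hset('mq_correct_running', 'checkdb',
           json.dumps(state, ensure_ascii=False))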
Example 4
def getall():
    sql = "select * from setting"
    rows = db.fetchall(sql)
    settings = []
    for row in rows:
        row['create_at'] = formatTimestamp(row['create_at'])
        row['update_at'] = formatTimestamp(row['update_at'])
        settings.append(row)

    return settings
Example 5
def getall():
    sql = "select * from proxy"
    rows = db.fetchall(sql)
    proxies = []
    for row in rows:
        row['create_at'] = formatTimestamp(row['create_at'])
        row['update_at'] = formatTimestamp(row['update_at'])
        proxies.append(row)

    return proxies
Example 6
def getall():
    sql = "select * from app order by id desc"
    rows = db.fetchall(sql)
    apps = []
    for row in rows:
        row['token_expired'] = formatTimestamp(row['token_expired'])
        row['create_at'] = formatTimestamp(row['create_at'])
        row['update_at'] = formatTimestamp(row['update_at'])
        apps.append(row)

    return apps
Example 7
def ApsCheckDb():
    rows = db.fetchall("select id,crontab from scheduler")
    dbRows = {}
    for row in rows:
        dbRows[row['id']] = row
    rows = sched.get_jobs()
    mRows = {}
    for job in rows:
        mRows[job.id] = job
    tasksNew = list(set(dbRows.keys()) - set(mRows.keys()))
    logger.debug("tasksNew::::%s" % ",".join(tasksNew))
    return True if tasksNew else False
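
ApsCheckDb reconciles the scheduler table against the jobs APScheduler actually holds by taking a set difference of IDs. The same pattern with plain data, no DB or scheduler required:

db_ids = {'1', '2', '3'}        # ids present in the scheduler table
job_ids = {'1', '3'}            # ids of jobs currently registered
tasks_new = db_ids - job_ids    # rows with no matching job: {'2'}
needs_reload = bool(tasks_new)  # True -> the scheduler must be refreshed

Note that APScheduler job ids are strings while the table id may be an integer; the set difference only finds real gaps if both sides use the same type.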
Example 8
def piping_getall_taskid(taskid = None):
    '''Fetch the data-processing pipelines for a task'''
    pipings = {}
    taskPipings = db.fetchall('select * from task_piping where task_id=:id', {'id': taskid})
    for piping in taskPipings:
        piping['create_at'] = formatTimestamp(piping['create_at'])
        piping['update_at'] = formatTimestamp(piping['update_at'])
        pipingType = piping['type']
        if pipingType in ['filterword', 'keyword']:
            pipingExtend = db.fetchone('select * from piping_extend where id=:id', {'id': piping['extend_id']})
            piping['words'] = pipingExtend['data']
        pipings[pipingType] = piping
    return pipings
Example 9
def piping_filterword(executeid):
    '''Sensitive-word filtering'''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type':'filterword'})
    if not piping: return True

    # System word list
    systemWords = []
    if piping['filterword_type'] in ['system','mixed']:
        pipingExtend = db.fetchall('select name from sys_filterword')
        systemWords = [row['name'] for row in pipingExtend] if pipingExtend else []
    # Own (user-defined) word list
    ownWords = ''
    if piping['filterword_type'] in ['own', 'mixed']:
        pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
        ownWords = pipingExtend['data'] if pipingExtend else ''
    words = []
    if piping['filterword_type'] == 'system':
        words = systemWords
    if piping['filterword_type'] == 'own':
        words = ownWords.split("\n")
    if piping['filterword_type'] == 'mixed':
        words = systemWords + ownWords.split('\n')
    words = list(set(words))

    # Drop empty entries produced by splitting
    if '' in words:
        words.remove('')
    if not words:
        return True

    acism = Acism(words)

    results = []
    rows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id','url', 'file_path', 'file_extension','url_type'])
    for row in rows:
        if row['url_type'] != 'self': continue
        if row['file_extension'] not in ('html', ''): continue
        # Fetch the crawled page body from the file store
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8', 'ignore')
        result = acism.scan(body)
        if result:
            filename = "snap_code_filterword_%s.png" % row['id']
            snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=result.keys())
            pipingResult = {"id":row['id'], "url":row['url'], "matches":result, 'snapshot':"\n".join(snapshots)}
            snapshot_insert(executeid, piping, row, pipingResult, snapshots)
            results.append(pipingResult)
    if results:
        return result_save(execute, piping, results)
    else:
        return True
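
Acism here is an Aho-Corasick multi-pattern matcher; the example only relies on Acism(words) and acism.scan(body) returning a truthy mapping of matched words. If that extension is unavailable, a naive pure-Python stand-in with the same two-call shape (the real library's exact return format is an assumption) could be:

import re

class NaiveAcism:
    '''Hypothetical drop-in for Acism: counts literal occurrences of each word.'''

    def __init__(self, words):
        # Assumes a non-empty word list, as Example 9 guarantees before construction.
        # Longest-first alternation so longer words win at the same position.
        pattern = '|'.join(re.escape(w) for w in sorted(words, key=len, reverse=True))
        self.regex = re.compile(pattern)

    def scan(self, body):
        matches = {}
        for m in self.regex.finditer(body):
            word = m.group(0)
            matches[word] = matches.get(word, 0) + 1
        return matches  # empty dict (falsy) when nothing matched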
Example 10
def execute_getall_taskid(taskid):
    rows = db.fetchall(
        'select * from task_execute where task_id=:task_id order by id desc',
        {'task_id': taskid})
    if not rows: return False
    executes = []
    for execute in rows:
        execute['start_at'] = formatTimestamp(
            execute['start_at']) if execute['start_at'] else ''
        execute['end_at'] = formatTimestamp(
            execute['end_at']) if execute['end_at'] else ''
        execute['create_at'] = formatTimestamp(execute['create_at'])
        execute['update_at'] = formatTimestamp(execute['update_at'])
        executes.append(execute)
    return executes
Example 11
def postgres2mongo(executeid):
    '''Move spider_url rows for an execute ID from PostgreSQL to MongoDB
    example:
        #for executeid in list(range(10000)):
        #    postgres2mongo(executeid)
    '''
    rows = db.fetchall('select * from spider_url where execute_id=:id',
                       {'id': executeid})
    if not rows: return False
    for row in rows:
        row['start_at'] = formatTimestamp(
            row['start_at']) if row['start_at'] else ''
        row['end_at'] = formatTimestamp(row['end_at']) if row['end_at'] else ''
        row['create_at'] = formatTimestamp(
            row['create_at']) if row['create_at'] else ''
        row['update_at'] = formatTimestamp(
            row['update_at']) if row['update_at'] else ''
    mongoSpider['spiderurl'].insert(rows)
    return True
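
Note that Collection.insert() is the legacy pymongo API (deprecated in pymongo 3.0 and removed in 4.0); on a current driver the equivalent call is mongoSpider['spiderurl'].insert_many(rows).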
Example 12
def task_getall(page=1, pagesize=20):
    page = _str2int(page) if page else 1
    pagesize = _str2int(pagesize) if pagesize else 20
    if page < 1: page = 1
    if pagesize < 1: pagesize = 20
    offset = (page - 1) * pagesize

    rows = db.fetchall(
        'select * from task order by id desc limit :limit offset :offset;', {
            'limit': pagesize,
            'offset': offset
        })
    if not rows: return False
    tasks = []
    for row in rows:
        row['create_at'] = formatTimestamp(row['create_at'])
        row['update_at'] = formatTimestamp(row['update_at'])
        tasks.append(row)
    return tasks
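
_str2int is another helper the listings never define. A defensive sketch consistent with how it is used here (the default-value behavior is an assumption):

def _str2int(value, default=0):
    # Hypothetical helper: coerce user-supplied paging input to int.
    try:
        return int(value)
    except (TypeError, ValueError):
        return default

With that in place, task_getall(page='2', pagesize='50') and task_getall() both behave sensibly, since the bounds checks clamp anything below 1.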
Example 13
def process_initexec(index):
    '''Data correction: initialize pending executes'''
    mqkey = 'initexec'
    procname = "spider-%s-%s" % (mqkey, index)
    setproctitle(procname)

    from common import db, mongoMq
    import business.task as bTask

    start = time()
    while True:
        # If this worker has been stopped, exit the process
        if time() - start > 10:
            if not _checkActive(mongoMq, procname[7:]): exit()
            start = time()
        rows = db.fetchall(
            "select id from task_execute where status=0 order by id asc")
        for row in rows:
            bTask.execute_init(row['id'])
        sleep(10)
Example 14
def piping_all(executeid):
    '''Run every configured pipeline for an execute'''
    row = mgdb.execute_getbyid(executeid)
    if not row:
        return {'status':0, 'msg':'task_execute[%s] is not exists' % executeid, 'dopiping_executeid':executeid}
    types = db.fetchall("select type from task_piping where task_id=:id", {'id':row['task_id']})
    if not types:
        db.updatebyid('task_execute', {'status':'4'}, row['id'])
        return {'status':1, 'msg':'piping ok', 'dopiping_executeid':executeid}
    for row1 in types:
        pipingType = row1['type']
        if pipingType == 'filterword':
            piping_filterword(row['id'])
        if pipingType == 'keyword':
            piping_keyword(row['id'])
        if pipingType == 'error_http_code':
            piping_errorHttpCode(row['id'])
        if pipingType == 'fingerprint':
            piping_fingerprint(row['id'])
        if pipingType == 'darklink':
            piping_darklink(row['id'])
    return {'status':1, 'msg':'piping ok', 'dopiping_executeid':executeid}
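
The if-chain in Example 14 maps a pipeline type string to a handler function; a dispatch table does the same in one lookup (handler names taken from these listings):

PIPELINES = {
    'filterword': piping_filterword,
    'keyword': piping_keyword,
    'error_http_code': piping_errorHttpCode,
    'fingerprint': piping_fingerprint,
    'darklink': piping_darklink,
}

def run_pipings(execute_id, types):
    # Dispatch each configured pipeline type, silently skipping unknown ones.
    for row in types:
        handler = PIPELINES.get(row['type'])
        if handler:
            handler(execute_id)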
Example 15
def piping_darklink(executeid):
    '''Dark-link (hidden link) detection'''
    execute = mgdb.execute_getbyid(executeid)
    if not execute: return False
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type':'darklink'})
    if not piping: return True
    pipingExtend = db.fetchall('select name from sys_filterword')
    words = [row['name'] for row in pipingExtend] if pipingExtend else []
    # if not words: return True
    acism = Acism(words)
    # Fetch URLs from the system black/white list tables
    rows = db.fetchall('select domain from dk_white_list')
    whites = [row['domain'] for row in rows] if rows else []
    rows = db.fetchall('select domain from dk_black_list')
    blacks = [row['domain'] for row in rows] if rows else []
    # Merge and de-duplicate the white/black list links
    whites_glb = list(set(whites))
    blacks_glb = list(set(blacks))
    # Fetch URLs from the per-task (personal) black/white lists
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
    extendData = ast.literal_eval(pipingExtend['data'])  # literal_eval (import ast) is a safe drop-in for the original eval()
    whites_psl = list(set(extendData['white_list']))
    blacks_psl = list(set(extendData['black_list']))
    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id', 'url', 'md5_url', 'md5_body', 'url_type','file_extension','file_path', 'invisible','referer'])
    results = []
    for row in mgRows:
        # Match against the personal black/white lists
        if row['url'] in whites_psl: continue
        if row['url'] in blacks_psl:
            seen = [params['darklink'] for params in results] if results else []
            if row['url'] not in seen:
                filename = 'snap_code_darklink_%s.png' % row['id']
                snapshots = _snapshot_save(executeid,'code',row['file_path'], filename, words=[row['url']])
                result = {'id':row['id'],'referer':row['referer'],'darklink':row['url'],'level':'absolute','snapshot':"\n".join(snapshots)}
                snapshot_insert(executeid, piping, row, result, snapshots)
                results.append(result)
                continue
        # Static files cannot be dark links
        if row['url_type'] != 'other': continue
        if row['file_extension'] not in ['', 'html']: continue
        # Match against the system black/white lists (confidence: absolute)
        if row['url'] in whites_glb: continue
        if row['url'] in blacks_glb:
            seen = [params['darklink'] for params in results] if results else []
            if row['url'] not in seen:
                filename = 'snap_code_darklink_%s.png' % row['id']
                snapshots = _snapshot_save(executeid,'code',row['file_path'], filename, words=[row['url']])
                result = {'id':row['id'],'referer':row['referer'],'darklink':row['url'],'level':'absolute','snapshot':"\n".join(snapshots)}
                snapshot_insert(executeid, piping, row, result, snapshots)
                results.append(result)
                continue
        # Sensitive-word detection (confidence: high)
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8', 'ignore')
        resultWord = acism.scan(body)
        if resultWord:
            seen = [params['darklink'] for params in results] if results else []
            if row['url'] not in seen:
                filename = 'snap_code_darklink_%s.png' % row['id']
                snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=[row['url']])
                result = {'id': row['id'], 'referer': row['referer'], 'darklink': row['url'], 'level': 'high','snapshot': "\n".join(snapshots)}
                snapshot_insert(executeid, piping, row, result, snapshots)
                results.append(result)
                finddata = {'domain':execute['domain'], 'md5_body':row['md5_body']}
                setdata = {'$set':{'filterwords':json.dumps(resultWord, ensure_ascii=False)}}
                mongoSpider['outlink'].find_and_modify(finddata, setdata)
                continue
        # Visibility check (confidence: low)
        if row['invisible']:
            seen = [params['darklink'] for params in results] if results else []
            if row['url'] not in seen:
                filename = 'snap_code_darklink_%s.png' % row['id']
                snapshots = _snapshot_save(executeid,'code',row['file_path'], filename, words=[row['url']])
                result = {'id':row['id'],'referer':row['referer'],'darklink':row['url'],'level':'low','snapshot':"\n".join(snapshots)}
                snapshot_insert(executeid, piping, row, result, snapshots)
                results.append(result)
                continue
        # # Duplicate detection (above the reference threshold: high; otherwise: medium)
        # if row['file_extension']:
        #     body = urlparse()
        # # Reference-count check: not strictly defined yet, TBD
        # match = {'$match': {'md5_url': row['md5_url']}}
        # group = {'$group': {'_id': '$domain', 'count': {'$sum': 1}}}
        # results = [i for i in mongoSpider['outlink'].aggregate([match, group])]
        # if len(results) > 500:
    if results:
        return result_save(execute, piping, results)
    return True
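
A portability note on Example 15: find_and_modify() is the legacy pymongo call, deprecated in pymongo 3.0 and removed in 4.0; the modern equivalent is mongoSpider['outlink'].find_one_and_update(finddata, setdata).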