Example #1
0
def getToken_key(key):
    """Issue a fresh access token for the app identified by ``key``.

    Looks up the ``app`` row whose ``unique_key`` equals ``key``, stores a new
    token and its expiry time on that row, and returns both.

    Args:
        key: value matched against ``app.unique_key``.

    Returns:
        ``False`` when no app row matches; otherwise a dict
        ``{'token': <32 hex characters>, 'expired': <formatted expiry time>}``.
    """
    import secrets

    appObj = db.fetchone('select * from app where unique_key=:key', {'key': key})
    if not appObj:
        return False

    # Expiry is the current getTime() value plus 7200
    # (presumably seconds, i.e. two hours -- confirm against getTime).
    tokenExpired = getTime("%Y-%m-%d %H:%M:%S", getTime() + 7200)
    # The token was previously md5(tokenExpired): derived only from the expiry
    # timestamp, so it was guessable and identical for every app refreshed in
    # the same second. Use a cryptographically random value instead; 16 random
    # bytes give 32 hex characters, the same length as an MD5 hex digest.
    token = secrets.token_hex(16)
    db.updatebyid('app', {'token': token, 'token_expired': tokenExpired}, appObj['id'])
    return {'token': token, 'expired': tokenExpired}
Example #2
0
def crawljs(taskInfo):
    """Fetch one JS task URL, parse it and persist the crawl result.

    Args:
        taskInfo: task record; ``status``, ``url`` and ``id`` are read.

    Returns:
        ``True`` when the task was already processed (status not 0 or 1) or
        when it was crawled and saved; ``False`` when any step raised (the
        exception is logged, not propagated).
    """
    try:
        # Already crawled: do not crawl again.
        if taskInfo['status'] not in (0, 1):
            return True

        # Fetch the page source.
        requestInfo = spiderRequest(taskInfo['url'])

        # Keep only the records that _parseForUrl turns into a truthy row.
        parseResults = []
        for record in _parseForJs(taskInfo['url']):
            urlRow = _parseForUrl(record)
            if urlRow:
                parseResults.append(urlRow)

        updateRow = {
            'id': taskInfo['id'],
            'http_code': requestInfo['http_code'],
            'response_headers': json.dumps(requestInfo['response_headers'],
                                           ensure_ascii=False),
            'body': requestInfo['body'],
            'md5_body': md5(requestInfo['body']),
            'parse_result': json.dumps(parseResults, ensure_ascii=False),
            'status': 2,
            'end_at': getTime('%Y-%m-%d %H:%M:%S'),
        }
        # Persist the result.
        mg_spiderjsurl_save(updateRow)
        # Previously the success path fell off the end and returned None,
        # which is falsy and indistinguishable from failure for truthiness
        # checks; report success explicitly, like the skip path above.
        return True
    except Exception as e:
        logger.exception(e)
        return False
Example #3
0
def execCasper(content=None):
    """Run ``content`` as a CasperJS script and return its combined output.

    The script text is written to a file under ``PATH_TMP_NODEJS`` named from
    the current date and ``md5(content)``, then executed with ``casperjs``.

    Args:
        content: script source to execute.

    Returns:
        The decoded stdout+stderr of the process, or ``False`` if anything
        raised (the exception is logged).
    """
    try:
        filename = "%s/%s_%s" % (PATH_TMP_NODEJS, getTime('%Y%m%d'),
                                 md5(content))
        write(filename, content)
        cmd = 'casperjs ' + filename
        # The with-block closes the pipe and communicate() waits for the
        # child, so no zombie process or open descriptor is left behind
        # (the original only read stdout and never reaped the process).
        with Popen(cmd,
                   shell=True,
                   close_fds=True,
                   bufsize=-1,
                   stdout=PIPE,
                   stderr=STDOUT) as child:
            raw, _ = child.communicate()
        # NOTE: the script file is deliberately left on disk; the original
        # had its removal commented out.
        return raw.decode()
    except Exception as e:
        logger.exception(e)
        return False
Example #4
0
def generate(urlid):
    """Mirror the page of one spider URL by running ``mirror.js`` in PhantomJS.

    The outcome is stored on the spiderurl record via ``mgdb.spiderurl_save``:
    status 4 when the process exits with code 0, 401 when it exits non-zero,
    402 (plus ``error``) when running it raised.

    Args:
        urlid: id passed to ``mgdb.spiderurl_getbyid``.

    Returns:
        A dict with ``status``, ``msg`` and ``domirror_urlid``. ``status`` is 0
        only when the record does not exist.
    """
    import shlex

    row = mgdb.spiderurl_getbyid(urlid)
    if not row:
        return {
            'status': 0,
            'msg': 'spider_url[%s] is not exists' % urlid,
            'domirror_urlid': urlid
        }

    try:
        # The URL comes from crawled data and goes through a shell: quote it
        # so characters such as '&' or ';' in a query string neither truncate
        # the argument nor run extra commands. (Assumes a POSIX shell.)
        command = "phantomjs --ignore-ssl-errors=true --proxy=%s %s/mirror.js %s" % (
            MIRROR_PROXY, PATH_NODEJS, shlex.quote(str(row['url'])))
        with subprocess.Popen(command,
                              shell=True,
                              close_fds=True,
                              bufsize=-1,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT) as child:
            lines = child.stdout.readlines()
            # wait() blocks until the exit code is available. poll() could
            # still return None right after EOF on stdout, which made a
            # failing run look successful.
            returncode = child.wait()
        processOutput = "\n".join(line.decode().strip() for line in lines)

        if returncode:
            mgdb.spiderurl_save({'status': 401}, row['id'])
            logger.debug("failed  %s\n%s\n%s" %
                         (row['url'], command, processOutput))
        else:
            mgdb.spiderurl_save({'status': 4}, row['id'])
            logger.debug("success %s\n%s\n%s" %
                         (row['url'], command, processOutput))
        return {'status': 1, 'msg': 'mirror ok', 'domirror_urlid': urlid}
    except Exception as e:
        mgdb.spiderurl_save({'status': 402, 'error': repr(e)}, row['id'])
        logger.error("doMirror::" + str(urlid) + "::" + repr(e))
        # NOTE(review): 'status' stays 1 here as in the original; confirm
        # whether callers expect a different value on failure.
        return {'status': 1, 'msg': repr(e), 'domirror_urlid': urlid}
Example #5
0
 def test4_time_int(self):
     # No format (None) plus an explicit date string: an int is expected.
     stamp = getTime(None, '20170101')
     self.assertTrue(isinstance(stamp, int))
Example #6
0
 def test3_time_str(self):
     # A format plus an explicit date string: a str is expected.
     formatted = getTime('%Y-%m-%d', '20170101')
     self.assertTrue(isinstance(formatted, str))
Example #7
0
 def test2_now_str(self):
     # Only a format is passed: a str is expected.
     formatted = getTime('%Y%m%d')
     self.assertTrue(isinstance(formatted, str))
Example #8
0
 def test1_now(self):
     # Called without arguments: an int is expected.
     stamp = getTime()
     self.assertTrue(isinstance(stamp, int))
Example #9
0
        now = getTime()
        result = isinstance(now, (int))
        self.assertTrue(result)

    def test2_now_str(self):
        # getTime with only a format argument must return a str.
        text = getTime('%Y%m%d')
        is_text = isinstance(text, str)
        self.assertTrue(is_text)

    def test3_time_str(self):
        # getTime with a format and a date string must return a str.
        text = getTime('%Y-%m-%d', '20170101')
        is_text = isinstance(text, str)
        self.assertTrue(is_text)

    def test4_time_int(self):
        # getTime with no format (None) and a date string must return an int.
        value = getTime(None, '20170101')
        is_int = isinstance(value, int)
        self.assertTrue(is_int)


# Ad-hoc demo: add a delay to the current getTime() value, check the sum is
# still an int, then format it back into a date string.
params = {'execute_delay': 3600}
print(getTime())
now = getTime() + params['execute_delay']
print(now, type(now) is int)
dateStr = getTime('%Y%m%d %H%M%S', now)
print(dateStr)
Example #10
0
def task_save(params=None):
    """Create or update a task and register, update or remove its scheduler job.

    With a truthy ``params['id']`` the task row is updated with the truthy
    fields present in ``params``; otherwise a new row is inserted, with
    defaults for every field that is missing or falsy. Afterwards:

    * if ``execute_at`` or ``execute_delay`` is given, a one-shot ``date``
      scheduler job is inserted or updated (``execute_delay`` wins when both
      are given);
    * else, if the saved task has no ``crontab``, ``task_start`` is called
      immediately;
    * else, if the task status is below 1, its scheduler job is deleted;
    * else a ``cron`` scheduler job is inserted or updated.

    Args:
        params: dict of task fields. ``params['start_urls']`` is a
            newline-separated string. Note that it is rewritten in place
            (JSON list, or ``''`` when empty).

    Returns:
        ``False`` when neither ``id`` nor ``start_urls`` is provided (or
        ``params`` is empty/None); otherwise the task id.
    """
    # Missing params or missing keys used to raise TypeError/KeyError here.
    if not params:
        return False
    taskid = params.get('id')
    rawStartUrls = params.get('start_urls')
    if not taskid and not rawStartUrls:
        return False

    startUrls = []
    if rawStartUrls:
        startUrls = rawStartUrls.split("\n")
        params['start_urls'] = json.dumps(startUrls, ensure_ascii=False)
    else:
        params['start_urls'] = ''

    # Default values (the duplicated 'exec_level' entry was dropped).
    defaultKeys = {
        'app_id': 0,
        'type': 'spider',
        'start_urls': '',
        'exec_level': 0,
        'limit_depth': 2,
        'limit_total': 1000,
        'limit_time': 0,
        'limit_subdomain': 0,
        'limit_image': 0,
        'limit_js': 0,
        'url_unique_mode': 'url-query',
        'notify_url': '',
        'source_ip': '',
        'exclude_urls': '',
        'proxies': '',
        'crontab': '',
        'status': 0,
    }

    # One-shot schedule: an absolute time and/or a delay from now.
    rundate = None
    if params.get('execute_at'):
        rundate = datetime.strptime(params['execute_at'], '%Y-%m-%d %H:%M:%S')
    if params.get('execute_delay'):
        rundateStr = getTime('%Y-%m-%d %H:%M:%S',
                             getTime() + params['execute_delay'])
        rundate = datetime.strptime(rundateStr, '%Y-%m-%d %H:%M:%S')

    # Save the task row.
    if taskid:
        # Update: only truthy values from params are written.
        taskdata = {key: params[key] for key in defaultKeys if params.get(key)}
        db.updatebyid('task', taskdata, taskid)
    else:
        # Insert: falsy or missing values fall back to the defaults.
        taskdata = {'site_id': _getSiteid(startUrls[0])}
        for key, default in defaultKeys.items():
            taskdata[key] = params.get(key) or default
        taskid = db.insert('task', taskdata)

    jobid = 'task_%s' % taskid
    jobArgs = '[' + str(taskid) + ']'

    # One-shot scheduled job.
    if rundate:
        if db.getbyid('scheduler', jobid):
            db.updatebyid('scheduler', {'run_date': rundate}, jobid)
        else:
            # Fixes: the table name was missing from this insert call, and
            # the 'misfire_grace_time' key carried a trailing space.
            db.insert('scheduler', {
                'id': jobid,
                'name': jobid,
                'func': 'business.task:task_start',
                'args': jobArgs,
                'trigger_type': 'date',
                'run_date': rundate,
                'coalesce': 0,
                'next_run_time': rundate,
                'max_instances': 3,
                'executor': 'default',
                'misfire_grace_time': 1,
            })
        return taskid

    # No crontab on the saved task: start it right away.
    task = db.fetchone("select * from task where id=:id", {'id': taskid})
    if not task['crontab']:
        task_start(taskid)
        return taskid

    # Remove the cron job of a disabled task. On the update path taskdata
    # only holds the truthy fields from params, so fall back to the stored
    # row instead of raising KeyError.
    status = taskdata.get('status', task['status'])
    hasCrontab = taskdata.get('crontab', task['crontab'])
    if status < 1 and hasCrontab:
        db.exec('delete from scheduler where id=:id', {'id': jobid})
        return taskid

    # Add or update the cron job.
    cronExpr = '0 ' + task['crontab'] + ' * *,SMHdmwWY'
    if db.getbyid('scheduler', jobid):
        db.updatebyid('scheduler', {'crontab': cronExpr}, jobid)
    else:
        tz = pytz.timezone('Asia/Shanghai')
        db.insert('scheduler', {
            'id': jobid,
            'name': jobid,
            'func': 'business.task:task_start',
            'args': jobArgs,
            'trigger_type': 'cron',
            'crontab': cronExpr,
            'coalesce': 0,
            'next_run_time':
            datetime.now(tz=tz).strftime('%Y-%m-%d %H:%M:%S%z'),
            'max_instances': 3,
            'executor': 'default',
            'misfire_grace_time': 1,
        })

    return taskid
Example #11
0
def _finish_spider_batch(batch, endAt):
    """Record the outcome of a finished spider batch and hand it on to piping."""
    execute = mongoSpider['execute'].find_one({'id': batch}, {'_id': 0})
    if not execute:
        # Previously this raised TypeError and aborted the whole scan.
        logger.error("no execute::::%s" % batch)
        return

    # Only as many spiderurl rows as there are start URLs are inspected,
    # taken in ascending id order.
    limit = len(json.loads(execute['start_urls']))
    spiderResult = {'ok': 0, 'failed': 0, 'error': []}
    cursor = mongoSpider['spiderurl'].find({
        'execute_id': batch
    }, {
        '_id': 0
    }).sort([("id", 1)]).limit(limit)
    for urlRow in cursor:
        # .get(): a document without an 'error' field counts as ok.
        if urlRow.get('error'):
            spiderResult['failed'] += 1
            spiderResult['error'].append(urlRow['error'])
        else:
            spiderResult['ok'] += 1

    # The crawl failed if more than half of the inspected pages failed.
    if spiderResult['failed'] > limit / 2:
        errorText = "\n".join(spiderResult['error'])
        db.updatebyid('task_execute', {
            'status': 3,
            'end_at': endAt,
            'error': errorText
        }, batch)
        # The Mongo copy used to be saved with status 2 here while the SQL
        # row got 3; every other branch writes the same status to both
        # stores, so the failed status is now 3 in both.
        mgdb.execute_save({
            'status': 3,
            'end_at': endAt,
            'error': errorText,
            "id": batch
        })
        bNotify.save(batch, eventType='spider_failed')
        return

    db.updatebyid('task_execute', {'status': 2, 'end_at': endAt}, batch)
    mgdb.execute_save({'status': 2, 'end_at': endAt, "id": batch})
    bNotify.save(batch, eventType='spider_ok')
    # Queue the finished execute for the 'piping' stage.
    mqExecute = deepcopy(execute)
    mqExecute[mqidKey] = mqExecute['id']
    mqExecute[batchKey] = mqExecute['id']
    Mq.produce([mqExecute], 'piping')


def execute_finish():
    """Mark completed batches as ended and trigger their follow-up actions.

    Scans every ``stats_batch_run`` document with ``is_end == 0``. A batch is
    complete when its ``stats_batch_stage`` counters show nothing in
    ``undo``/``ready``/``doing`` and a non-zero ``done`` equal to ``total``.
    Complete batches are flagged as ended in both collections and then
    handled according to their ``mqkey`` ('spider', 'mirror' or 'piping').

    Returns:
        The list of batch ids that were marked as ended by this call.
    """
    finishIds = []
    # Check every batch that has not been marked as ended yet.
    for run in mongoMq['stats_batch_run'].find({'is_end': 0}, {
            "_id": 0,
            'mqkey': 1,
            'batch': 1
    }):
        mqkey = run['mqkey']
        batch = run['batch']
        stats = mongoMq['stats_batch_stage'].find_one(
            {
                "mqkey": mqkey,
                'batch': batch
            }, {"_id": 0})
        if not stats:
            logger.error("no stats_batch_stage::::%s::::%s" % (mqkey, batch))
            continue

        complete = (not stats['undo'] and not stats['ready']
                    and not stats['doing'] and stats['done']
                    and stats['done'] == stats['total'])
        if not complete:
            continue

        endAt = getTime('%Y-%m-%d %H:%M:%S')
        mongoMq['stats_batch_stage'].update(
            {
                "mqkey": mqkey,
                'batch': batch
            }, {"$set": {
                'end': 1
            }})
        mongoMq['stats_batch_run'].update({
            "mqkey": mqkey,
            'batch': batch
        }, {"$set": {
            'is_end': 1,
            'end_at': endAt
        }})
        finishIds.append(batch)

        if mqkey == 'spider':
            _finish_spider_batch(batch, endAt)
        elif mqkey == 'mirror':
            db.updatebyid('task_execute', {'status': 4}, batch)
            mgdb.execute_save({'status': 4, "id": batch})
            bNotify.save(batch, eventType='mirror_ok')
        elif mqkey == 'piping':
            db.updatebyid('task_execute', {'status': 4}, batch)
            mgdb.execute_save({'status': 4, "id": batch})
            bNotify.save(batch, 'piping_all', {'piping_status': 'ok'})
    return finishIds