def project_update():
    stat_url = request.form['stat_url']
    project_id = request.form['project_id']
    project = Project.find_project_by_id(project_id)
    project.stat_url = stat_url
    db.session.commit()
    return redirect("/project/manage", code=302)
Example #2
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     arguments = {}
     if job_instance.spider_arguments:
         arguments = dict(
             map(lambda x: x.split("="),
                 job_instance.spider_arguments.split(",")))
     threshold = 0
     daemon_size = len(self.spider_service_instances)
     if job_instance.priority == JobPriority.HIGH:
         threshold = int(daemon_size / 2)
     if job_instance.priority == JobPriority.HIGHEST:
         threshold = int(daemon_size)
     threshold = 1 if threshold == 0 else threshold
     candidates = self.spider_service_instances
     leaders = []
     # TODO: use a better function to elect the leader
     for i in range(threshold):
         leaders.append(random.choice(candidates))
     for leader in leaders:
         service_job_id = leader.start_spider(project.project_name,
                                              spider_name, arguments)
         job_execution = JobExecution()
         job_execution.project_id = job_instance.project_id
         job_execution.service_job_execution_id = service_job_id
         job_execution.job_instance_id = job_instance.id
         job_execution.create_time = datetime.datetime.now()
         job_execution.running_on = leader.server
         db.session.add(job_execution)
         db.session.commit()
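Note: the dict(map(...)) call above assumes spider_arguments is a comma-separated list of key=value pairs with no extra '=' or ',' inside a value. A minimal sketch of that parsing with a hypothetical argument string:

# Hypothetical argument string in the "key=value,key=value" form start_spider expects.
spider_arguments = "daemon=srv1:6800,depth_limit=3"
arguments = dict(
    map(lambda x: x.split("="),
        spider_arguments.split(",")))
print(arguments)  # {'daemon': 'srv1:6800', 'depth_limit': '3'}
# A value containing "=" (e.g. "query=a=b") would split into three parts and make
# dict() raise ValueError; the defaultdict variant in Example #20 avoids this.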
Example #3
 def log_url(self, job_execution):
     job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     project = Project.find_project_by_id(job_instance.project_id)
     for spider_service_instance in self.spider_service_instances:
         if spider_service_instance.server == job_execution.running_on:
             return spider_service_instance.log_url(project.project_name, job_instance.spider_name,
                                                    job_execution.service_job_execution_id)
Example #4
def job_periodic(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance_list = [job_instance.to_dict() for job_instance in
                         JobInstance.query.filter_by(run_type="periodic", project_id=project_id).order_by(
                             JobInstance.id).all()]  # ADD order_by
    return render_template("job_periodic.html",
                           job_instance_list=job_instance_list)
Example #5
 def log_url(self, job_execution):
     job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     project = Project.find_project_by_id(job_instance.project_id)
     for spider_service_instance in self.spider_service_instances:
         if spider_service_instance.server == job_execution.running_on:
             return spider_service_instance.log_url(project.project_name, job_instance.spider_name,
                                                    job_execution.service_job_execution_id)
Example #6
def download_items(project_id, job_exec_id):
    format = request.args.get('format')
    if format not in ['json', 'csv']:
        abort(404)

    job_execution = JobExecution.query.filter_by(project_id=project_id,
                                                 id=job_exec_id).first()

    job_instance = JobInstance.find_job_instance_by_id(
        job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)

    res = requests.get(agent.items_url(job_execution))
    res.encoding = 'utf8'
    json_data = [json.loads(s) for s in filter(None, res.text.split('\n'))]

    filename = '{}-{}.{}'.format(project.project_name,
                                 job_instance.spider_name, format)
    if format == 'json':
        open(os.path.join(app.static_folder, filename),
             'w').write(json.dumps(json_data))
    elif format == 'csv':
        f = open(os.path.join(app.static_folder, filename), 'w')
        csvwriter = csv.writer(f)
        count = 0
        for item in json_data:
            if count == 0:
                header = item.keys()
                csvwriter.writerow(header)
                count += 1
            csvwriter.writerow(item.values())
        f.close()

    return send_from_directory(app.static_folder, filename, as_attachment=True)
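Note: the CSV branch above takes the header from the first item's keys and then writes every item's values positionally, so rows with a different key set or ordering would be misaligned. A hedged alternative sketch using csv.DictWriter (assuming the scraped items are flat dicts; not the original implementation):

import csv

def write_items_csv(path, json_data):
    # Collect the union of keys so rows with missing or extra fields still line up.
    fieldnames = []
    for item in json_data:
        for key in item:
            if key not in fieldnames:
                fieldnames.append(key)
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(json_data)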
Example #7
 def get(self, project_id):
     project = Project.find_project_by_id(project_id)
     return [
         spider_instance.to_dict()
         for spider_instance in SpiderInstance.query.filter_by(
             project_id=project_id).all()
     ]
Example #8
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose daemon manually
    if request.form['daemon'] != 'auto':
        spider_args = []
        if request.form['spider_arguments']:
            spider_args = request.form['spider_arguments'].split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get(
            'cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get(
            'cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron exp manually
        if request.form.get('cron_exp'):
            job_instance.cron_minutes, job_instance.cron_hour, job_instance.cron_day_of_month, job_instance.cron_day_of_week, job_instance.cron_month = \
                request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
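Note: when cron_exp is supplied, the handler above unpacks it on single spaces into minutes, hour, day-of-month, day-of-week and month, in that order (day-of-week before month, so the form is expected to send the fields that way). A minimal sketch with a hypothetical expression:

# Hypothetical cron expression: minute 0 of every 6th hour.
cron_exp = "0 */6 * * *"
cron_minutes, cron_hour, cron_day_of_month, cron_day_of_week, cron_month = \
    cron_exp.split(' ')
print(cron_minutes, cron_hour)  # 0 */6
# split(' ') is strict: a double space would produce an empty extra field and
# break the 5-way unpacking with a ValueError.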
Example #9
    def log_url_slave(self, job_execution):
        """
        Purpose: fetch the log of a slave spider; only one is needed
        :param job_execution: a job_execution object
        :return: the log URL
        """
        job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
        project = Project.find_project_by_id(job_instance.project_id)
        # packed execution-id string for the master and slave runs
        service_job_execution_id = job_execution.service_job_execution_id.split('>')
        # list of slave job execution ids
        slave_service_job_execution_id = service_job_execution_id[1].split(',')
        # addresses of the servers the spiders ran on
        running_on = job_execution.running_on.split('>')
        slave_running_on = running_on[1].split(',')
        # fetch the slave spiders' logs
        spider_name_slave_obj = SpiderInstance.query.filter_by(
            spider_name=job_instance.spider_name,
            project_id=job_instance.project_id).first()
        spider_name_slave = spider_name_slave_obj.spider_name_slave

        for spider_service_instance in self.spider_service_instances_slave:
            for job_execution_id, running_on_ in zip(slave_service_job_execution_id, slave_running_on):
                if spider_service_instance.server == running_on_:
                    slave_log_url = spider_service_instance.log_url(
                        project.project_name, spider_name_slave,
                        job_execution_id)
                    return slave_log_url
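Note: log_url_slave relies on a packed string convention (written by the master/slave start_spider shown later in this listing): the master's job id and server come before '>', and the slave ids/servers follow as comma-separated lists. A small sketch of that format with made-up values:

# Hypothetical packed ids/servers as stored on the JobExecution row.
service_job_execution_id = "master-job-1>slave-job-1,slave-job-2"
running_on = "http://master:6800>http://slave1:6800,http://slave2:6800"

master_id, slave_ids = service_job_execution_id.split('>')
slave_ids = slave_ids.split(',')              # ['slave-job-1', 'slave-job-2']
master_server, slave_servers = running_on.split('>')
slave_servers = slave_servers.split(',')      # index-aligned with slave_ids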
Example #10
def job_add(project_id):
    # Save the upload file, and save the file path to the
    # job_instance.spider_arguments
    dst = ''
    if 'file' in request.files:
        file = request.files['file']
        # if the user does not select a file, the browser
        # also submits an empty part without a filename
        if file.filename == '':
            pass
        if file and allowed_seed(file.filename):
            filename = secure_filename(file.filename)
            dst = os.path.join(
                app.config['UPLOAD_DIR'],
                datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-") +
                filename)
            file.save(dst)
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    if dst:
        if job_instance.spider_arguments:
            job_instance.spider_arguments += (",seed={}".format(dst))
        else:
            job_instance.spider_arguments = "seed={}".format(dst)

    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose daemon manually
    if request.form['daemon'] != 'auto':
        spider_args = []
        if request.form['spider_arguments']:
            spider_args = request.form['spider_arguments'].split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get(
            'cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get(
            'cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron exp manually
        if request.form.get('cron_exp'):
            job_instance.cron_minutes, job_instance.cron_hour, job_instance.cron_day_of_month, job_instance.cron_day_of_week, job_instance.cron_month = \
                request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
Example #11
 def cancel_spider(self, job_execution):
     job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     project = Project.find_project_by_id(job_instance.project_id)
     for spider_service_instance in self.spider_service_instances:
         if spider_service_instance.server == job_execution.running_on:
             if spider_service_instance.cancel_spider(project.project_name, job_execution.service_job_execution_id):
                 job_execution.end_time = datetime.datetime.now()
                 job_execution.running_status = SpiderStatus.CANCELED
                 db.session.commit()
             break
Example #12
def project_delete(project_id):
    project = Project.find_project_by_id(project_id)
    agent.delete_project(project)
    try:
        db.session.delete(project)
        db.session.commit()
    except:
        db.session.rollback()
        raise
    return redirect("/project/manage", code=302)
Example #13
 def cancel_spider(self, job_execution):
     job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     project = Project.find_project_by_id(job_instance.project_id)
     # TODO multi service
     for spider_service_instance in self.spider_service_instances:
         if spider_service_instance.server == job_execution.running_on:
             if spider_service_instance.cancel_spider(project.project_name, job_execution.service_job_execution_id):
                 job_execution.running_status = SpiderStatus.CANCELED
                 db.session.commit()
             break
Example #14
def get_all_spiders_info():
    """
    Purpose: return information about all spiders
    :return: JSON data, formatted as follows:
    {
      "last_run_status": "success",
      "last_run_time": "2018-09-19 03:30:45",
      "project_alias": "深圳市共享开放平台",
      "project_id": 1,
      "project_name": "opendata_sz",
      "spider_alias": "所有数据",
      "spider_id": 1,
      "spider_name": "list"
    }
    """
    # temporary list holding the results
    data = []
    # iterate over the job instance table to collect spider information
    job_instance = JobInstance.query.order_by(
        JobInstance.date_created).group_by(JobInstance.project_id).all()
    for spider in job_instance:
        # get the instance as a dict
        _temp = spider.to_dict()
        # look up the Project table by project id for the project name and alias
        project_base_info = Project.find_project_by_id(_temp['project_id'])
        instance_to_job_execution = JobExecution.query.filter_by(
            job_instance_id=_temp['job_instance_id']).all()
        if instance_to_job_execution:
            # get the status of the latest execution
            _status = instance_to_job_execution[-1].running_status
            # convert the status code to a display string
            status = switcher.get(_status, "CANCELED")
            # last run time of the instance
            last_run_time = instance_to_job_execution[-1].end_time
            service_job_execution_id = instance_to_job_execution[
                -1].service_job_execution_id
        else:
            status = 'PENDING'
            last_run_time = None
            service_job_execution_id = None
        # pack the information into a dict
        _dict = dict(project_id=_temp['project_id'],
                     project_name=project_base_info.project_name,
                     project_alias=project_base_info.project_alias,
                     spider_id=SpiderInstance.query.filter_by(
                         project_id=_temp['project_id']).first().id,
                     spider_name=_temp['spider_name'],
                     spider_alias=_temp['desc'],
                     last_run_status=status,
                     last_run_time=str(last_run_time).split('.')[0],
                     run_type=_temp['run_type'],
                     job_exec_id=service_job_execution_id,
                     is_msd=project_base_info.is_msd)
        data.append(_dict)
    return json.dumps({"code": 200, 'data': data})
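Note: switcher is not defined in this snippet; presumably it maps the stored running_status codes to the display strings shown in the docstring, with "CANCELED" as the fallback. A hypothetical sketch of such a mapping:

# Hypothetical mapping; the real codes come from SpiderStatus.
switcher = {
    SpiderStatus.PENDING: "PENDING",
    SpiderStatus.RUNNING: "RUNNING",
    SpiderStatus.FINISHED: "success",
    SpiderStatus.CANCELED: "CANCELED",
}
status = switcher.get(_status, "CANCELED")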
Example #15
def inject_project():
    project_context = {}
    project_context['project_list'] = Project.query.all()
    if project_context['project_list'] and (not session.get('project_id')):
        project = Project.query.first()
        session['project_id'] = project.id
    if session.get('project_id'):
        project_context['project'] = Project.find_project_by_id(session['project_id'])
        project_context['spider_list'] = [spider_instance.to_dict() for spider_instance in
                                          SpiderInstance.query.filter_by(project_id=session['project_id']).all()]
    else:
        project_context['project'] = {}
    return project_context
Example #16
 def cancel_spider(self, job_execution):
     job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     project = Project.find_project_by_id(job_instance.project_id)
     for spider_service_instance in self.spider_service_instances:
         if spider_service_instance.server == job_execution.running_on:
             if spider_service_instance.cancel_spider(project.project_name, job_execution.service_job_execution_id):
                 job_execution.end_time = datetime.datetime.now()
                 job_execution.running_status = SpiderStatus.CANCELED
                 try:
                     db.session.commit()
                 except:
                     db.session.rollback()
                     raise
             break
Example #17
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     # job_execution = JobExecution.find_job_by_service_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     arguments = {}
     if job_instance.spider_arguments:
         arguments = dict(
             map(lambda x: x.split("="),
                 job_instance.spider_arguments.split(",")))
     threshold = 0
     daemon_size = len(self.spider_service_instances)
     if job_instance.priority == JobPriority.HIGH:
         threshold = int(daemon_size / 2)
     if job_instance.priority == JobPriority.HIGHEST:
         threshold = int(daemon_size)
     threshold = 1 if threshold == 0 else threshold
     candidates = self.spider_service_instances
     leaders = []
     if 'daemon' in arguments:
         for candidate in candidates:
             if candidate.server == arguments['daemon']:
                 leaders = [candidate]
     else:
         # TODO: use a better function to elect the leader
         for i in range(threshold):
             leaders.append(random.choice(candidates))
     for leader in leaders:
         # add more arguments to scrapyd to run a spider
         arguments['project_id'] = job_instance.project_id
         arguments['project_name'] = project.project_name
         arguments['job_instance_id'] = job_instance.job_instance_id
         arguments['priority'] = job_instance.priority
         arguments['args'] = job_instance.spider_arguments
         arguments['execute_ip'] = leader.server
         arguments['create_time'] = datetime.datetime.now()
         service_job_id = leader.start_spider(project.project_name,
                                              spider_name, arguments)
         job_execution = JobExecution()
         job_execution.project_id = job_instance.project_id
         job_execution.service_job_execution_id = service_job_id
         job_execution.job_instance_id = job_instance.job_instance_id
         job_execution.create_time = arguments['create_time']
         job_execution.running_on = leader.server
         db.session.add(job_execution)
         db.session.commit()
Example #18
def spider_egg_upload(project_id):
    project = Project.find_project_by_id(project_id)
    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.referrer)
    file = request.files['file']
    # if the user does not select a file, the browser
    # also submits an empty part without a filename
    if file.filename == '':
        flash('No selected file')
        return redirect(request.referrer)
    if file:
        filename = secure_filename(file.filename)
        dst = os.path.join(tempfile.gettempdir(), filename)
        file.save(dst)
        agent.deploy(project, dst)
        flash('deploy success!')
    return redirect(request.referrer)
Example #19
def clear_jobexecution(job_execution):
    """
    clear_jobexecution
    Check whether the JobExecution still exists on the scrapyd servers;
    delete it if it no longer exists.

    :param job_execution:
    :return:
    """
    job_instance = JobInstance.find_job_instance_by_id(
        job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    if not check_job_existed(running_on=job_execution.running_on,
                             project_name=project.project_name,
                             spider_name=job_instance.spider_name,
                             job_id=job_execution.service_job_execution_id):
        db.session.delete(job_execution)
        db.session.commit()
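Note: check_job_existed is not shown here. A hedged sketch of how it could be implemented against Scrapyd's listjobs.json endpoint (the function name, parameters and fallback behaviour are taken from the call above, not from the original source):

import requests

def check_job_existed(running_on, project_name, spider_name, job_id):
    # Ask the scrapyd daemon the execution ran on whether the job id still
    # appears in its pending/running/finished lists. spider_name is accepted
    # to match the call site; listjobs.json does not filter by spider.
    url = '{}/listjobs.json'.format(running_on.rstrip('/'))
    try:
        payload = requests.get(url, params={'project': project_name},
                               timeout=5).json()
    except (requests.RequestException, ValueError):
        return True  # daemon unreachable: keep the record rather than delete it
    for state in ('pending', 'running', 'finished'):
        if any(job.get('id') == job_id for job in payload.get(state, [])):
            return True
    return False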
Example #20
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     #arguments = {}
     #if job_instance.spider_arguments:
     #    arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
     from collections import defaultdict
     arguments = defaultdict(list)
     if job_instance.spider_arguments:
         for k, v in list(map(lambda x: x.split('=', 1), job_instance.spider_arguments.split(','))):
             arguments[k].append(v)
     # threshold = 0
     # daemon_size = len(self.spider_service_instances)
     # if job_instance.priority == JobPriority.HIGH:
     #     threshold = int(daemon_size / 2)
     # if job_instance.priority == JobPriority.HIGHEST:
     #     threshold = int(daemon_size)
     # threshold = 1 if threshold == 0 else threshold
     threshold = 1
     candidates = self.spider_service_instances
     leaders = []
     if 'daemon' in arguments:
         for candidate in candidates:
             if candidate.server == arguments['daemon']:
                 leaders = [candidate]
     else:
         # TODO: use a better function to elect the leader
         for i in range(threshold):
             leaders.append(random.choice(candidates))
     for leader in leaders:
         service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
         job_execution = JobExecution()
         job_execution.project_id = job_instance.project_id
         job_execution.service_job_execution_id = service_job_id
         job_execution.job_instance_id = job_instance.id
         job_execution.create_time = datetime.datetime.now()
         job_execution.running_on = leader.server
         try:
             db.session.add(job_execution)
             db.session.commit()
         except:
             db.session.rollback()
             raise
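Note: unlike the plain dict(map(...)) parsing in Example #2, this variant splits each pair on the first '=' only and collects repeated keys into lists, so values may contain '=' and a key may appear more than once. A small sketch with hypothetical arguments:

from collections import defaultdict

# Hypothetical argument string: a repeated key and a value that contains '='.
spider_arguments = "tag=a,tag=b,query=price=low"
arguments = defaultdict(list)
for k, v in map(lambda x: x.split('=', 1), spider_arguments.split(',')):
    arguments[k].append(v)
print(dict(arguments))  # {'tag': ['a', 'b'], 'query': ['price=low']}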
Example #21
 def log_url_master(self, job_execution):
     """
     Purpose: fetch the master spider's log
     :param job_execution: a job_execution object
     :return: the log URL
     """
     job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     project = Project.find_project_by_id(job_instance.project_id)
     # packed execution-id string for the master and slave runs
     service_job_execution_id = job_execution.service_job_execution_id.split('>')
     # master job execution id
     master_service_job_execution_id = service_job_execution_id[0]
     # addresses of the servers the spiders ran on
     running_on = job_execution.running_on.split('>')
     master_running_on = running_on[0]
     # fetch the master spider's log
     for spider_service_instance in self.spider_service_instances_master:
         if spider_service_instance.server == master_running_on:
             master_log_url = spider_service_instance.log_url(
                 project.project_name, job_instance.spider_name,
                 master_service_job_execution_id)
             return master_log_url
Example #22
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     #arguments = {}
     #if job_instance.spider_arguments:
     #    arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
     from collections import defaultdict
     arguments = defaultdict(list)
     if job_instance.spider_arguments:
         for k, v in list(map(lambda x: x.split('=', 1), job_instance.spider_arguments.split(','))):
             arguments[k].append(v)
     threshold = 0
     daemon_size = len(self.spider_service_instances)
     if job_instance.priority == JobPriority.HIGH:
         threshold = int(daemon_size / 2)
     if job_instance.priority == JobPriority.HIGHEST:
         threshold = int(daemon_size)
     threshold = 1 if threshold == 0 else threshold
     candidates = self.spider_service_instances
     leaders = []
     if 'daemon' in arguments:
         for candidate in candidates:
             if candidate.server == arguments['daemon']:
                 leaders = [candidate]
     else:
         # TODO: use a better function to elect the leader
         for i in range(threshold):
             leaders.append(random.choice(candidates))
     for leader in leaders:
         service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
         job_execution = JobExecution()
         job_execution.project_id = job_instance.project_id
         job_execution.service_job_execution_id = service_job_id
         job_execution.job_instance_id = job_instance.id
         job_execution.create_time = datetime.datetime.now()
         job_execution.running_on = leader.server
         db.session.add(job_execution)
         db.session.commit()
Example #23
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get(
            'cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get(
            'cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
Example #24
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    spider_url = request.form['spider_url']
    spider_models = request.form['spider_models']
    if job_instance.spider_name == "news":
        allowed_domains = spider_url.split('/')[2]
        a = "allowed_domains=" + allowed_domains
        a = a + "," + "model=" + spider_models
        job_instance.spider_arguments = a
        r = CRedis()
        r.lpush(allowed_domains + ':start_urls', spider_url)
    elif job_instance.spider_name == 'jd':
        r = CRedis()
        r.lpush('jd:start_urls', spider_url)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get(
            'cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get(
            'cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
Example #25
def spider_deploy(project_id):
    project = Project.find_project_by_id(project_id)
    spider_instance_list = agent.get_spider_list(project)
    SpiderInstance.update_spider_instances(spider_instance_list)
    return render_template("spider_deploy.html")
Example #26
def service_stats(project_id):
    project = Project.find_project_by_id(project_id)
    run_stats = JobExecution.list_run_stats_by_hours(project_id)
    return render_template("server_stats.html", run_stats=run_stats)
Example #27
 def get(self, project_id):
     project = Project.find_project_by_id(project_id)
     return agent.get_spider_list(project) if project else []
Example #28
def spider_deploy(project_id):
    project = Project.find_project_by_id(project_id)
    return render_template("spider_deploy.html")
Example #29
def project_stats(project_id):
    project = Project.find_project_by_id(project_id)
    # run_stats = JobExecution.list_run_stats_by_hours(project_id)
    run_stats = JobExecution.list_run_stats_by_days(project_id)
    return render_template("project_stats.html", run_stats=run_stats)
Example #30
def project_delete(project_id):
    project = Project.find_project_by_id(project_id)
    agent.delete_project(project)
    db.session.delete(project)
    db.session.commit()
    return redirect("/project/manage", code=302)
Example #31
 def start_spider(self, job_instance):
     """
     功能: 启动爬虫,首先启动从爬虫, 至少有一个从爬虫启动成功后启动主爬虫
     :param job_instance: job_instance对象
     :return: None
     """
     project = Project.find_project_by_id(job_instance.project_id)
     if project.is_msd == '0':  # 如果是单机爬虫
         spider_name = job_instance.spider_name
         for leader in self.spider_service_instances_master:
             service_job_id = leader.start_spider(project.project_name, spider_name)
             # if the start succeeded
             if service_job_id:
                 job_execution = JobExecution()
                 job_execution.project_id = job_instance.project_id
                 job_execution.service_job_execution_id = service_job_id + '>'
                 job_execution.job_instance_id = job_instance.id
                 job_execution.create_time = datetime.datetime.now()
                 job_execution.running_on = leader.server + '>'
                 db.session.add(job_execution)
                 db.session.commit()
                 break
     else:
         # master spider name
         spider_name_master = job_instance.spider_name
         spider_instance = SpiderInstance.query.filter_by(
             project_id=job_instance.project_id, spider_name=spider_name_master).first()
         # slave spider name
         spider_name_slave = spider_instance.spider_name_slave
         # flag marking that at least one slave spider server started successfully
         slave_flag = False
         # list of slave job execution ids
         service_job_id_slave = []
         # list of servers the slave spiders run on
         running_on_slave = []
         # iterate over the slave spider servers
         for leader in self.spider_service_instances_slave:
             # start the spider; returns an id on success, otherwise None
             service_job_id = leader.start_spider(project.project_name, spider_name_slave)
             # if the start succeeded
             if service_job_id:
                 # mark that a slave started
                 slave_flag = True
                 # record the job id so the log can be fetched later
                 service_job_id_slave.append(service_job_id)
                 # record the server it runs on so the log can be fetched later
                 running_on_slave.append(leader.server)
         # join the lists into packed strings
         service_job_id_slave_str = ','.join(service_job_id_slave)
         running_on_slave_str = ','.join(running_on_slave)
         # if at least one slave server started successfully, start the master spider
         if slave_flag:
             for leader in self.spider_service_instances_master:
                 service_job_id = leader.start_spider(project.project_name, spider_name_master)
                 # if the start succeeded
                 if service_job_id:
                     job_execution = JobExecution()
                     job_execution.project_id = job_instance.project_id
                     job_execution.service_job_execution_id = service_job_id+'>'+service_job_id_slave_str
                     job_execution.job_instance_id = job_instance.id
                     job_execution.create_time = datetime.datetime.now()
                     job_execution.running_on = leader.server+'>'+running_on_slave_str
                     db.session.add(job_execution)
                     db.session.commit()
                     break
Example #32
 def get(self, project_id):
     project = Project.find_project_by_id(project_id)
     return agent.get_job_status(project)
Example #33
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     task_id = job_instance.id
     if job_instance.keywords is not None:
         keywords_list = job_instance.keywords.strip(',').split(',')
         for keywords in keywords_list:
             arguments = {}
             if job_instance.spider_arguments:
                 arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
             threshold = 0       # threshold
             leaders = []
             arguments['keywords'] = keywords
             arguments['video_time_short'] = job_instance.video_time_short
             arguments['video_time_long'] = job_instance.video_time_long
             if job_instance.upload_time_type == '设定区间':                 # '设定区间' = user-specified interval; otherwise the window is derived from the task's run period
                 arguments['startDate'] = dts2ts(job_instance.upload_time_start_date)
                 arguments['endDate'] = dts2ts(job_instance.upload_time_end_date)
             else:
                 arguments['startDate'] = int(time.time()) - 3600*24*job_instance.spider_freq - 3600*24
                 arguments['endDate'] = int(time.time())
             arguments['task_id'] = task_id   # pass the task id to the spider
             daemon_size = len(self.spider_service_instances)
             if job_instance.priority == JobPriority.HIGH:
                 threshold = int(daemon_size / 2)
             if job_instance.priority == JobPriority.HIGHEST:
                 threshold = int(daemon_size)
             threshold = 1 if threshold == 0 else threshold
             candidates = self.spider_service_instances
             if 'daemon' in arguments:
                 for candidate in candidates:
                     if candidate.server == arguments['daemon']:
                         leaders = [candidate]
             else:
                 # TODO: use a better function to elect the leader
                 for i in range(threshold):
                     leaders.append(random.choice(candidates))
             for leader in leaders:
                 print(project.project_name, spider_name, arguments)
                 service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
                 job_execution = JobExecution()
                 job_execution.project_id = job_instance.project_id
                 job_execution.service_job_execution_id = service_job_id
                 job_execution.job_instance_id = job_instance.id
                 job_execution.create_time = datetime.datetime.now()
                 job_execution.running_on = leader.server
                 db.session.add(job_execution)
                 db.session.commit()
     else:
         arguments = {}
         if job_instance.spider_arguments:
             arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
         threshold = 0  # threshold
         leaders = []
         arguments['video_time_short'] = job_instance.video_time_short
         arguments['video_time_long'] = job_instance.video_time_long
         arguments['startDate'] = dts2ts(job_instance.upload_time_start_date)
         arguments['endDate'] = dts2ts(job_instance.upload_time_end_date)
         arguments['task_id'] = task_id  # pass the task id to the spider
         daemon_size = len(self.spider_service_instances)
         if job_instance.priority == JobPriority.HIGH:
             threshold = int(daemon_size / 2)
         if job_instance.priority == JobPriority.HIGHEST:
             threshold = int(daemon_size)
         threshold = 1 if threshold == 0 else threshold
         candidates = self.spider_service_instances
         if 'daemon' in arguments:
             for candidate in candidates:
                 if candidate.server == arguments['daemon']:
                     leaders = [candidate]
         else:
             # TODO: use a better function to elect the leader
             for i in range(threshold):
                 leaders.append(random.choice(candidates))
         for leader in leaders:
             print(project.project_name, spider_name, arguments)
             service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
             job_execution = JobExecution()
             job_execution.project_id = job_instance.project_id
             job_execution.service_job_execution_id = service_job_id
             job_execution.job_instance_id = job_instance.id
             job_execution.create_time = datetime.datetime.now()
             job_execution.running_on = leader.server
             db.session.add(job_execution)
             db.session.commit()
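Note: dts2ts is not defined in this snippet; from its use with upload_time_start_date and upload_time_end_date it presumably converts a date string into a Unix timestamp. A hypothetical sketch (the format string and the integer return type are assumptions):

import time

def dts2ts(date_string, fmt="%Y-%m-%d %H:%M:%S"):
    # Sketch: date string -> integer Unix timestamp (local-timezone dependent);
    # the real format used by the job form may differ.
    return int(time.mktime(time.strptime(date_string, fmt)))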