Example #1
def project_create():
    project_name = request.form['project_name']
    project = Project()
    project.project_name = project_name
    db.session.add(project)
    db.session.commit()
    return redirect("/project/%s/spider/deploy" % project.id, code=302)
Example #2
 def post(self):
     project_name = request.form['project_name']
     project = Project()
     project.project_name = project_name
     db.session.add(project)
     db.session.commit()
     return project.to_dict()
Example #3
 def get_project_list(self):
     data = request("get",
                    self._scrapyd_url() + "/listprojects.json",
                    return_type="json")
     result = []
     if data:
         for project_name in data['projects']:
             project = Project()
             project.project_name = project_name
             result.append(project)
     return result
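For context, Scrapyd's listprojects.json endpoint returns a JSON object whose "projects" key holds the project names that the loop above iterates over. The snippet below is only an illustration of that response shape (hard-coded data, no HTTP call):

# Illustrative only: typical shape of a /listprojects.json response.
data = {"status": "ok", "projects": ["myproject", "otherproject"]}
for project_name in data["projects"]:
    print(project_name)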
Example #4
 def get(self, project_id):
     project = Project.find_project_by_id(project_id)
     return [
         spider_instance.to_dict()
         for spider_instance in SpiderInstance.query.filter_by(
             project_id=project_id).all()
     ]
Example #5
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose the daemon manually
    if request.form['daemon'] != 'auto':
        spider_args = []
        if request.form['spider_arguments']:
            spider_args = request.form['spider_arguments'].split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get(
            'cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get(
            'cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron exp manually
        if request.form.get('cron_exp'):
            job_instance.cron_minutes, job_instance.cron_hour, job_instance.cron_day_of_month, job_instance.cron_month, job_instance.cron_day_of_week = \
                request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
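The cron_exp shortcut above splits a space-separated expression into five fields in the order minutes, hour, day of month, month, day of week. Below is a minimal standalone sketch of that mapping; the helper name is hypothetical and only mirrors the JobInstance attributes set in the example:

def parse_cron_exp(cron_exp):
    # "0 3 * * 1" -> minute 0, hour 3, any day of month, any month, Mondays only
    minutes, hour, day_of_month, month, day_of_week = cron_exp.split(' ')
    return {
        'cron_minutes': minutes,
        'cron_hour': hour,
        'cron_day_of_month': day_of_month,
        'cron_month': month,
        'cron_day_of_week': day_of_week,
    }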
Example #6
 def log_url(self, job_execution):
     job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     project = Project.find_project_by_id(job_instance.project_id)
     for spider_service_instance in self.spider_service_instances:
         if spider_service_instance.server == job_execution.running_on:
             return spider_service_instance.log_url(project.project_name, job_instance.spider_name,
                                                    job_execution.service_job_execution_id)
Example #7
def job_periodic(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance_list = [
        job_instance.to_dict() for job_instance in JobInstance.query.filter_by(
            run_type="periodic", project_id=project_id).all()
    ]
    return render_template("job_periodic.html",
                           job_instance_list=job_instance_list)
Example #8
 def cancel_spider(self, job_execution):
     job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     project = Project.find_project_by_id(job_instance.project_id)
     for spider_service_instance in self.spider_service_instances:
         if spider_service_instance.server == job_execution.running_on:
             if spider_service_instance.cancel_spider(project.project_name, job_execution.service_job_execution_id):
                 job_execution.end_time = datetime.datetime.now()
                 job_execution.running_status = SpiderStatus.CANCELED
                 db.session.commit()
             break
Example #9
def inject_project():
    project_context = {}
    project_context['project_list'] = Project.query.all()
    if project_context['project_list'] and (not session.get('project_id')):
        project = Project.query.first()
        session['project_id'] = project.id
    if session.get('project_id'):
        project_context['project'] = Project.find_project_by_id(
            session['project_id'])
        project_context['spider_list'] = [
            spider_instance.to_dict()
            for spider_instance in SpiderInstance.query.filter_by(
                project_id=session['project_id']).all()
        ]
    else:
        project_context['project'] = {}
    return project_context
Example #10
def spider_egg_upload(project_id):
    project = Project.find_project_by_id(project_id)
    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.referrer)
    file = request.files['file']
    # if the user does not select a file, the browser also
    # submits an empty part without a filename
    if file.filename == '':
        flash('No selected file')
        return redirect(request.referrer)
    if file:
        filename = secure_filename(file.filename)
        dst = os.path.join(tempfile.gettempdir(), filename)
        file.save(dst)
        agent.deploy(project, dst)
        flash('deploy success!')
    return redirect(request.referrer)
Example #11
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     #arguments = {}
     #if job_instance.spider_arguments:
     #    arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
     from collections import defaultdict
     arguments = defaultdict(list)
     if job_instance.spider_arguments:
         # split "key=value" pairs and strip whitespace around keys and values
         for k, v in [x.split('=', 1) for x in job_instance.spider_arguments.split(',')]:
             arguments[k.strip()].append(v.strip())
     threshold = 0
     daemon_size = len(self.spider_service_instances)
     if job_instance.priority == JobPriority.HIGH:
         threshold = int(daemon_size / 2)
     if job_instance.priority == JobPriority.HIGHEST:
         threshold = int(daemon_size)
     threshold = 1 if threshold == 0 else threshold
     candidates = self.spider_service_instances
     leaders = []
     if 'daemon' in arguments:
         for candidate in candidates:
             if candidate.server == arguments['daemon'][0]:
                 leaders = [candidate]
     else:
         # TODO optimize some better func to vote the leader
         for i in range(threshold):
             leaders.append(random.choice(candidates))
     for leader in leaders:
         service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
         job_execution = JobExecution()
         job_execution.project_id = job_instance.project_id
         job_execution.service_job_execution_id = service_job_id
         job_execution.job_instance_id = job_instance.id
         job_execution.create_time = datetime.datetime.now()
         job_execution.running_on = leader.server
         db.session.add(job_execution)
         db.session.commit()
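The argument parsing above turns a comma-separated key=value string into a dict of lists, so a repeated key (or the special daemon key) keeps all of its values. Below is a minimal standalone sketch of that parsing; the helper name is hypothetical and it assumes well-formed key=value pairs:

from collections import defaultdict

def parse_spider_arguments(spider_arguments):
    # "daemon=worker1,tag=news,tag=sports" ->
    # {'daemon': ['worker1'], 'tag': ['news', 'sports']}
    arguments = defaultdict(list)
    for pair in spider_arguments.split(','):
        key, value = pair.split('=', 1)
        arguments[key.strip()].append(value.strip())
    return arguments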
Example #12
def project_stats(project_id, spider_id):
    if spider_id == "project":
        project = Project.find_project_by_id(project_id)
        spider = SpiderInstance.query.filter_by(project_id=project_id).all()
        working_time = JobExecution.list_working_time(project_id)
        last_run = JobExecution.list_last_run(project_id)
        quality_review = JobExecution.list_quality_review(project_id)
        last_ee = JobExecution.list_last_ee(project_id)
        run_stats = JobExecution.list_run_stats_by_hours(project_id)
        return render_template("project_stats.html",
                               project=project,
                               spider=spider,
                               working_time=working_time,
                               last_run=last_run,
                               quality_review=quality_review,
                               last_ee=last_ee,
                               run_stats=run_stats)

    elif spider_id == "server":
        project = Project.find_project_by_id(project_id)
        run_stats = JobExecution.list_run_stats_by_hours(project_id)
        request_stats = JobExecution.list_request_stats_by_hours(
            project_id, spider_id)
        item_stats = JobExecution.list_item_stats_by_hours(
            project_id, spider_id)
        return render_template("server_stats.html", run_stats=run_stats)

    else:
        project = Project.find_project_by_id(project_id)
        spider = SpiderInstance.query.filter_by(project_id=project_id,
                                                id=spider_id).first()
        results = JobExecution.list_spider_stats(project_id, spider_id)

        start_time = []
        end_time = []
        end_time_short = []
        duration_time = []
        requests_count = []
        items_count = []
        items_cached = []
        warnings_count = []
        errors_count = []
        bytes_count = []
        retries_count = []
        exceptions_count = []
        exceptions_size = []
        cache_size_count = []
        cache_object_count = []
        last_start_time = ""
        last_items_count = ""
        old_items_count = []

        # Display date trick for small charts
        displayDates = False
        displayedDates = []
        for i in range(0, len(results)):
            if (results[i]['end_time'] is not None
                    and results[i]['end_time'].split(" ")[0] not in displayedDates):
                displayedDates.append(results[i]['end_time'].split(" ")[0])
        if len(displayedDates) > 2:
            displayDates = True

        # remove last JobInstance if not started or not finished
        if results and (results[-1]['start_time'] is None
                        or results[-1]['end_time'] is None):
            results.pop()

        for i in range(0, len(results)):
            if i == len(results) - 1:
                last_start_time = results[i]['start_time']
                last_items_count = results[i]['items_count']
            else:
                old_items_count.append(results[i]['items_count'])

            start_time.append(results[i]['start_time'])
            end_time.append(results[i]['end_time'])
            started = datetime.datetime.strptime(results[i]['start_time'],
                                                 '%Y-%m-%d %H:%M:%S')
            finished = datetime.datetime.strptime(results[i]['end_time'],
                                                  '%Y-%m-%d %H:%M:%S')
            duration_time.append((finished - started).total_seconds())

            if displayDates:
                end_time_short.append(end_time[-1].split(" ")[0])
            else:
                end_time_short.append(end_time[-1].split(" ")[1])

            requests_count.append(results[i]['requests_count'])
            items_count.append(results[i]['items_count'])
            # items beyond the number of requests, floored at zero
            items_cached.append(
                max(results[i]['items_count'] - results[i]['requests_count'], 0))
            warnings_count.append(results[i]['warnings_count'])
            errors_count.append(results[i]['errors_count'])
            bytes_count.append(results[i]['bytes_count'])
            retries_count.append(results[i]['retries_count'])

            exceptions_count.append(results[i]['exceptions_count'])
            if results[i]['exceptions_count'] > 10:
                exceptions_size.append(30)
            else:
                exceptions_size.append(results[i]['exceptions_count'] * 3)

            cache_size_count.append(results[i]['cache_size_count'])
            cache_object_count.append(results[i]['cache_object_count'])

        # tricks to have a nice gauge
        if len(results) == 0:
            min_items_count = 0
            max_items_count = 100
            average_items_count = 50
        else:
            items_not_null = [i for i in old_items_count if i != 0]
            if not items_not_null:
                items_not_null = [0]
            min_items_count = min(items_not_null)
            if old_items_count:
                max_items_count = max(old_items_count)
            else:
                max_items_count = last_items_count
            average_items_count = sum(items_not_null) / len(items_not_null)
            # guard against a zero maximum to avoid division errors
            if max_items_count and (min_items_count / max_items_count) > 0.8:
                min_items_count = max_items_count * 0.8
            if max_items_count and (
                    (average_items_count / max_items_count) > 0.95
                    or max_items_count == last_items_count):
                max_items_count = average_items_count * 1.05

        return render_template("spider_stats.html",
                               spider=spider,
                               start_time=start_time,
                               end_time=end_time,
                               end_time_short=end_time_short,
                               duration_time=duration_time,
                               last_start_time=last_start_time,
                               last_items_count=last_items_count,
                               average_items_count=average_items_count,
                               min_items_count=min_items_count,
                               max_items_count=max_items_count,
                               requests_count=requests_count,
                               items_count=items_count,
                               items_cached=items_cached,
                               warnings_count=warnings_count,
                               errors_count=errors_count,
                               bytes_count=bytes_count,
                               retries_count=retries_count,
                               exceptions_count=exceptions_count,
                               exceptions_size=exceptions_size,
                               cache_size_count=cache_size_count,
                               cache_object_count=cache_object_count)
Example #13
def spider_deploy(project_id):
    project = Project.find_project_by_id(project_id)
    return render_template("spider_deploy.html")
Example #14
def project_delete(project_id):
    project = Project.find_project_by_id(project_id)
    agent.delete_project(project)
    db.session.delete(project)
    db.session.commit()
    return redirect("/project/manage", code=302)
Example #15
 def get_project_list(self):
     project_list = self.spider_service_instances[0].get_project_list()
     Project.load_project(project_list)
     return [project.to_dict() for project in Project.query.all()]