def project_create():
    project_name = request.form['project_name']
    project = Project()
    project.project_name = project_name
    db.session.add(project)
    db.session.commit()
    return redirect("/project/%s/spider/deploy" % project.id, code=302)
def post(self):
    project_name = request.form['project_name']
    project = Project()
    project.project_name = project_name
    db.session.add(project)
    db.session.commit()
    return project.to_dict()
def get_project_list(self):
    data = request("get", self._scrapyd_url() + "/listprojects.json", return_type="json")
    result = []
    if data:
        for project_name in data['projects']:
            project = Project()
            project.project_name = project_name
            result.append(project)
    return result
def get(self, project_id):
    project = Project.find_project_by_id(project_id)
    return [
        spider_instance.to_dict()
        for spider_instance in SpiderInstance.query.filter_by(
            project_id=project_id).all()
    ]
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose daemon manually
    if request.form['daemon'] != 'auto':
        spider_args = []
        if request.form['spider_arguments']:
            spider_args = request.form['spider_arguments'].split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron expression manually; standard five-field order is
        # minute hour day-of-month month day-of-week
        if request.form.get('cron_exp'):
            (job_instance.cron_minutes, job_instance.cron_hour,
             job_instance.cron_day_of_month, job_instance.cron_month,
             job_instance.cron_day_of_week) = request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
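For reference, a minimal sketch of what the cron_exp unpacking above produces for a standard five-field expression (the sample string is hypothetical):

# Hypothetical expression: minute 30 of every hour, weekdays only.
cron_exp = "30 * * * 1-5"
minutes, hour, day_of_month, month, day_of_week = cron_exp.split(' ')
# minutes == '30', hour == '*', day_of_month == '*',
# month == '*', day_of_week == '1-5'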
def log_url(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            return spider_service_instance.log_url(project.project_name,
                                                   job_instance.spider_name,
                                                   job_execution.service_job_execution_id)
def job_periodic(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance_list = [
        job_instance.to_dict()
        for job_instance in JobInstance.query.filter_by(
            run_type="periodic", project_id=project_id).all()
    ]
    return render_template("job_periodic.html", job_instance_list=job_instance_list)
def cancel_spider(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            if spider_service_instance.cancel_spider(project.project_name,
                                                     job_execution.service_job_execution_id):
                job_execution.end_time = datetime.datetime.now()
                job_execution.running_status = SpiderStatus.CANCELED
                db.session.commit()
            break
def inject_project():
    project_context = {}
    project_context['project_list'] = Project.query.all()
    # default to the first project when none is selected yet
    if project_context['project_list'] and (not session.get('project_id')):
        project = Project.query.first()
        session['project_id'] = project.id
    if session.get('project_id'):
        project_context['project'] = Project.find_project_by_id(session['project_id'])
        project_context['spider_list'] = [
            spider_instance.to_dict()
            for spider_instance in SpiderInstance.query.filter_by(
                project_id=session['project_id']).all()
        ]
    else:
        project_context['project'] = {}
    return project_context
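inject_project reads like a Flask template context processor: it returns a dict that gets merged into every template's rendering context. A minimal sketch of how such a function is typically registered (the @app.context_processor decorator is standard Flask; that `app` is this project's application object is an assumption):

# Assumption: `app` is the Flask application instance of this project.
@app.context_processor
def inject_project():
    ...  # body as above; the returned dict becomes available in all templates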
def spider_egg_upload(project_id):
    project = Project.find_project_by_id(project_id)
    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.referrer)
    file = request.files['file']
    # if the user does not select a file, the browser also
    # submits an empty part without a filename
    if file.filename == '':
        flash('No selected file')
        return redirect(request.referrer)
    if file:
        filename = secure_filename(file.filename)
        dst = os.path.join(tempfile.gettempdir(), filename)
        file.save(dst)
        agent.deploy(project, dst)
        flash('deploy success!')
    return redirect(request.referrer)
def start_spider(self, job_instance):
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    # Collect spider arguments into a multi-dict so repeated keys are kept.
    from collections import defaultdict
    arguments = defaultdict(list)
    if job_instance.spider_arguments:
        for k, v in (x.strip().split('=', 1) for x in job_instance.spider_arguments.split(',')):
            arguments[k].append(v)
    # Higher-priority jobs are fanned out to more daemons.
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        # a daemon was pinned explicitly via the spider arguments
        for candidate in candidates:
            if candidate.server == arguments['daemon'][0]:
                leaders = [candidate]
    else:
        # TODO optimize: a better function to vote the leader
        for i in range(threshold):
            leaders.append(random.choice(candidates))
    for leader in leaders:
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        db.session.commit()
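A quick, runnable sketch of what the argument parsing above produces (the sample argument string is hypothetical):

from collections import defaultdict

spider_arguments = "daemon=server1, tag=news, tag=sports"  # hypothetical input
arguments = defaultdict(list)
for k, v in (x.strip().split('=', 1) for x in spider_arguments.split(',')):
    arguments[k].append(v)
print(dict(arguments))
# {'daemon': ['server1'], 'tag': ['news', 'sports']}
# 'daemon' pins the job to one server; other keys pass through to the spider.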
def project_stats(project_id, spider_id):
    if spider_id == "project":
        project = Project.find_project_by_id(project_id)
        spider = SpiderInstance.query.filter_by(project_id=project_id).all()
        working_time = JobExecution.list_working_time(project_id)
        last_run = JobExecution.list_last_run(project_id)
        quality_review = JobExecution.list_quality_review(project_id)
        last_ee = JobExecution.list_last_ee(project_id)
        run_stats = JobExecution.list_run_stats_by_hours(project_id)
        return render_template("project_stats.html",
                               project=project,
                               spider=spider,
                               working_time=working_time,
                               last_run=last_run,
                               quality_review=quality_review,
                               last_ee=last_ee,
                               run_stats=run_stats)
    elif spider_id == "server":
        project = Project.find_project_by_id(project_id)
        run_stats = JobExecution.list_run_stats_by_hours(project_id)
        request_stats = JobExecution.list_request_stats_by_hours(project_id, spider_id)
        item_stats = JobExecution.list_item_stats_by_hours(project_id, spider_id)
        return render_template("server_stats.html", run_stats=run_stats)
    else:
        project = Project.find_project_by_id(project_id)
        spider = SpiderInstance.query.filter_by(project_id=project_id,
                                                id=spider_id).first()
        results = JobExecution.list_spider_stats(project_id, spider_id)
        start_time = []
        end_time = []
        end_time_short = []
        duration_time = []
        requests_count = []
        items_count = []
        items_cached = []
        warnings_count = []
        errors_count = []
        bytes_count = []
        retries_count = []
        exceptions_count = []
        exceptions_size = []
        cache_size_count = []
        cache_object_count = []
        last_start_time = ""
        last_items_count = ""
        old_items_count = []
        # Display-date trick for small charts: label points with full dates
        # only when the runs span more than two distinct days.
        displayDates = False
        displayedDates = []
        for i in range(len(results)):
            if (results[i]['end_time'] is not None) and (results[i]['end_time'].split(" ")[0] not in displayedDates):
                displayedDates.append(results[i]['end_time'].split(" ")[0])
        if len(displayedDates) > 2:
            displayDates = True
        # drop the last run if it has not started or not finished yet
        if (len(results) > 0) and ((results[-1]['start_time'] is None) or (results[-1]['end_time'] is None)):
            results.pop()
        for i in range(len(results)):
            if i == len(results) - 1:
                last_start_time = results[i]['start_time']
                last_items_count = results[i]['items_count']
            else:
                old_items_count.append(results[i]['items_count'])
            start_time.append(results[i]['start_time'])
            end_time.append(results[i]['end_time'])
            duration_time.append(
                (datetime.datetime.strptime(results[i]['end_time'], '%Y-%m-%d %H:%M:%S') -
                 datetime.datetime.strptime(results[i]['start_time'], '%Y-%m-%d %H:%M:%S')).total_seconds())
            if displayDates:
                end_time_short.append(end_time[-1].split(" ")[0])
            else:
                end_time_short.append(end_time[-1].split(" ")[1])
            requests_count.append(results[i]['requests_count'])
            items_count.append(results[i]['items_count'])
            if results[i]['items_count'] != 0 and results[i]['items_count'] - results[i]['requests_count'] >= 0:
                items_cached.append(results[i]['items_count'] - results[i]['requests_count'])
            else:
                items_cached.append(0)
            warnings_count.append(results[i]['warnings_count'])
            errors_count.append(results[i]['errors_count'])
            bytes_count.append(results[i]['bytes_count'])
            retries_count.append(results[i]['retries_count'])
            exceptions_count.append(results[i]['exceptions_count'])
            # cap the bubble size so a noisy run does not dwarf the chart
            if results[i]['exceptions_count'] > 10:
                exceptions_size.append(30)
            else:
                exceptions_size.append(results[i]['exceptions_count'] * 3)
            cache_size_count.append(results[i]['cache_size_count'])
            cache_object_count.append(results[i]['cache_object_count'])
        # tricks to get a nice gauge
        if len(results) == 0:
            min_items_count = 0
            max_items_count = 100
            average_items_count = 50
        else:
            items_not_null = [i for i in old_items_count if i != 0]
            if len(items_not_null) == 0:
                items_not_null = [0]
            min_items_count = min(items_not_null)
            if len(old_items_count) == 0:
                max_items_count = last_items_count
            else:
                max_items_count = max(old_items_count)
            average_items_count = sum(items_not_null) / len(items_not_null)
            # guard against a zero max (all recorded runs yielded no items)
            if max_items_count and (min_items_count / max_items_count) > 0.8:
                min_items_count = max_items_count * 0.8
            if max_items_count and ((average_items_count / max_items_count) > 0.95 or max_items_count == last_items_count):
                max_items_count = average_items_count * 1.05
        return render_template("spider_stats.html",
                               spider=spider,
                               start_time=start_time,
                               end_time=end_time,
                               end_time_short=end_time_short,
                               duration_time=duration_time,
                               last_start_time=last_start_time,
                               last_items_count=last_items_count,
                               average_items_count=average_items_count,
                               min_items_count=min_items_count,
                               max_items_count=max_items_count,
                               requests_count=requests_count,
                               items_count=items_count,
                               items_cached=items_cached,
                               warnings_count=warnings_count,
                               errors_count=errors_count,
                               bytes_count=bytes_count,
                               retries_count=retries_count,
                               exceptions_count=exceptions_count,
                               exceptions_size=exceptions_size,
                               cache_size_count=cache_size_count,
                               cache_object_count=cache_object_count)
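A worked example of the gauge-bounds adjustment above, with hypothetical item counts from past runs:

# Hypothetical run history: item counts per completed job.
old_items_count = [0, 120, 110]
items_not_null = [i for i in old_items_count if i != 0]          # [120, 110]
min_items_count = min(items_not_null)                            # 110
max_items_count = max(old_items_count)                           # 120
average_items_count = sum(items_not_null) / len(items_not_null)  # 115.0
# 110/120 > 0.8, so the gauge floor widens to 120 * 0.8 = 96.0;
# 115/120 > 0.95, so the ceiling is raised to 115 * 1.05 = 120.75,
# keeping the needle away from both edges of the gauge.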
def spider_deploy(project_id):
    project = Project.find_project_by_id(project_id)
    return render_template("spider_deploy.html")
def project_delete(project_id):
    project = Project.find_project_by_id(project_id)
    agent.delete_project(project)
    db.session.delete(project)
    db.session.commit()
    return redirect("/project/manage", code=302)
def get_project_list(self):
    project_list = self.spider_service_instances[0].get_project_list()
    Project.load_project(project_list)
    return [project.to_dict() for project in Project.query.all()]