def sync_job_status(self, project):
    found_jobs = []
    job_execution_list = JobExecution.list_uncomplete_job(project)
    job_execution_dict = dict(
        [(job_execution.service_job_execution_id, job_execution)
         for job_execution in job_execution_list])
    for spider_service_instance in self.spider_service_instances:
        job_status = spider_service_instance.get_job_list(project.project_name)
        # pending
        pending_found_jobs = self._process_pending_jobs(job_status)
        running_found_jobs = self._process_running_jobs(job_status, job_execution_dict)
        finished_found_jobs = self._process_finished_jobs(job_status, job_execution_dict)
        found_jobs += pending_found_jobs
        found_jobs += running_found_jobs
        found_jobs += finished_found_jobs
    # mark jobs as CRASHED when no service instance reported them
    for job_execution in job_execution_list:
        if job_execution.service_job_execution_id not in found_jobs:
            job_execution.running_status = SpiderStatus.CRASHED
            job_execution.end_time = datetime.datetime.now()
    # commit
    try:
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        raise e
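# --- Hedged sketch, not part of the source: the _process_pending_jobs /
# _process_running_jobs / _process_finished_jobs helpers called above are not shown
# in this section. Judging from the older sync_job_status variant further down, each
# one plausibly walks a single bucket of the scrapyd job_status payload, updates the
# matching JobExecution rows, and returns the service job ids it saw, e.g.:
def _process_finished_jobs(self, job_status, job_execution_dict):
    found_jobs = []
    for job_execution_info in job_status[SpiderStatus.FINISHED]:
        found_jobs.append(job_execution_info['id'])
        job_execution = job_execution_dict.get(job_execution_info['id'])
        if job_execution and job_execution.running_status != SpiderStatus.FINISHED:
            job_execution.start_time = job_execution_info['start_time']
            job_execution.end_time = job_execution_info['end_time']
            job_execution.running_status = SpiderStatus.FINISHED
    return found_jobs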
def job_favorites(project_id):
    unique_spiders = set()
    unique_favorite_jobs = list()
    spider_colours = {}
    if request.method == 'POST':
        favorite_spiders = list(filter(None, request.form['favorite'].split(',')))
        jobs = JobExecution.favorite_spiders_jobs(project_id, favorite_spiders, 5000) \
            if favorite_spiders else []
        # keep only one instance for every spider
        for job in jobs:
            instance = job['job_instance']
            if instance.get('spider_name') not in unique_spiders:
                unique_favorite_jobs.append(job)
                unique_spiders.add(instance.get('spider_name'))
    for spider_name in unique_spiders:
        spider_id, old_items_count = JobExecution.get_last_execution_by_spider(spider_name, project_id)
        if not old_items_count:
            spider_colours[spider_name] = {'colour': None, 'spider_id': spider_id}
            continue
        last_items_count = old_items_count.pop(0)
        (min_items_count, average_items_count, max_items_count) = \
            _compute_item_stats(old_items_count, last_items_count)
        if 0 <= last_items_count <= min_items_count:
            colour = 'danger'
        elif min_items_count < last_items_count <= max_items_count:
            colour = 'success'
        else:
            colour = 'warning'
        spider_colours[spider_name] = {'colour': colour, 'spider_id': spider_id}
    return render_template("job_favorites.html",
                           job_status=unique_favorite_jobs,
                           spider_colours=spider_colours,
                           method=request.method)
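# --- Hedged sketch, not part of the source: _compute_item_stats is called here and in
# job_dashboard but never defined in this section. Based on the gauge logic in
# project_stats below, it plausibly returns (min, average, max) over the historical
# item counts, ignoring zero-item runs, e.g.:
def _compute_item_stats(old_items_count, last_items_count):
    items_not_null = [i for i in old_items_count if i != 0] or [0]
    min_items_count = min(items_not_null)
    max_items_count = max(old_items_count) if old_items_count else last_items_count
    average_items_count = sum(items_not_null) / len(items_not_null)
    return min_items_count, average_items_count, max_items_count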
def job_dashboard(project_id):
    jobs = JobExecution.list_jobs(project_id)
    unique_spiders = set()
    for job in jobs['COMPLETED']:
        instance = job['job_instance']
        unique_spiders.add(instance.get('spider_name'))
    spider_colours = {}
    for spider_name in unique_spiders:
        spider_id, old_items_count = JobExecution.get_last_execution_by_spider(spider_name, project_id)
        if not old_items_count:
            spider_colours[spider_name] = {'colour': None, 'spider_id': spider_id}
            continue
        last_items_count = old_items_count.pop(0)
        (min_items_count, average_items_count, max_items_count) = \
            _compute_item_stats(old_items_count, last_items_count)
        if 0 <= last_items_count <= min_items_count:
            colour = 'danger'
        elif min_items_count < last_items_count <= max_items_count:
            colour = 'success'
        else:
            colour = 'warning'
        spider_colours[spider_name] = {'colour': colour, 'spider_id': spider_id}
    return render_template("job_dashboard.html",
                           job_status=jobs,
                           spider_colours=spider_colours,
                           bit_enabled=config.BACK_IN_TIME_ENABLED)
def start_spider(self, job_instance):
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    # arguments = {}
    # if job_instance.spider_arguments:
    #     arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
    from collections import defaultdict
    arguments = defaultdict(list)
    if job_instance.spider_arguments:
        for k, v in list(
                map(lambda x: [y.strip() for y in x.split('=', 1)],
                    job_instance.spider_arguments.split(','))):
            arguments[k].append(v)
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        for candidate in candidates:
            if candidate.server == arguments['daemon'][0]:
                leaders = [candidate]
    else:
        # TODO optimize some better func to vote the leader
        for i in range(threshold):
            leaders.append(random.choice(candidates))
    for leader in leaders:
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        try:
            db.session.add(job_execution)
            db.session.commit()
        except:
            db.session.rollback()
            raise
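# Illustration only (made-up argument string): the defaultdict(list) parser above is a
# multidict, so a key passed twice keeps both values, where the commented-out dict(...)
# version would silently keep only the last one.
from collections import defaultdict

arguments = defaultdict(list)
for k, v in (pair.strip().split('=', 1) for pair in "depth=2,tag=a,tag=b".split(',')):
    arguments[k].append(v)
assert dict(arguments) == {'depth': ['2'], 'tag': ['a', 'b']}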
def run_back_in_time(self, job_instance):
    # prevent jobs overlapping for the same spider
    if not job_instance.overlapping and self._spider_already_running(
            job_instance.spider_name, job_instance.project_id):
        return
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    from collections import defaultdict
    arguments = defaultdict(list)
    if job_instance.spider_arguments:
        for k, v in list(map(lambda x: x.strip().split('=', 1),
                             job_instance.spider_arguments.split(','))):
            arguments[k].append(v)
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        for candidate in candidates:
            if candidate.server == arguments['daemon'][0]:
                leaders = [candidate]
    else:
        # TODO optimize some better func to vote the leader
        for i in range(threshold):
            leaders.append(random.choice(candidates))
    for leader in leaders:
        service_job_id = leader.back_in_time(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            raise e
def sync_job_status(self, project):
    for spider_service_instance in self.spider_service_instances:
        job_status = spider_service_instance.get_job_list(project.project_name)
        job_execution_list = JobExecution.list_uncomplete_job()
        job_execution_dict = dict([
            (job_execution.service_job_execution_id, job_execution)
            for job_execution in job_execution_list
        ])
        # running
        for job_execution_info in job_status[SpiderStatus.RUNNING]:
            job_execution = job_execution_dict.get(job_execution_info['id'])
            if job_execution and job_execution.running_status == SpiderStatus.PENDING:
                job_execution.start_time = job_execution_info['start_time']
                job_execution.running_status = SpiderStatus.RUNNING
        # finished
        for job_execution_info in job_status[SpiderStatus.FINISHED]:
            job_execution = job_execution_dict.get(job_execution_info['id'])
            if job_execution and job_execution.running_status != SpiderStatus.FINISHED:
                job_execution.start_time = job_execution_info['start_time']
                job_execution.end_time = job_execution_info['end_time']
                job_execution.running_status = SpiderStatus.FINISHED
                # parse the crawl stats out of the tail of the job's log
                res = requests.get(self.log_url(job_execution))
                res.encoding = 'utf8'
                raw = res.text[-4096:]
                match = re.findall(job_execution.RAW_STATS_REGEX, raw, re.DOTALL)
                if match:
                    job_execution.raw_stats = match[0]
                    job_execution.process_raw_stats()
        # commit
        try:
            db.session.commit()
        except:
            db.session.rollback()
            raise
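# --- Assumption, not the source's pattern: JobExecution.RAW_STATS_REGEX is not shown
# in this section. Scrapy closes each run by logging "Dumping Scrapy stats:" followed
# by a dict dump, which is why reading only the last 4096 bytes of the log is usually
# enough; a plausible stand-in pattern looks like:
import re

RAW_STATS_REGEX = r"Dumping Scrapy stats:\s*(\{.+?\})"  # hypothetical regex
log_tail = "INFO: Dumping Scrapy stats:\n{'item_scraped_count': 42}\nINFO: Spider closed"
match = re.findall(RAW_STATS_REGEX, log_tail, re.DOTALL)
print(match[0] if match else None)  # {'item_scraped_count': 42}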
def project_stats(project_id, spider_id):
    if spider_id == "project":
        project = Project.find_project_by_id(project_id)
        spider = SpiderInstance.query.filter_by(project_id=project_id).all()
        working_time = JobExecution.list_working_time(project_id)
        last_run = JobExecution.list_last_run(project_id)
        quality_review = JobExecution.list_quality_review(project_id)
        last_ee = JobExecution.list_last_ee(project_id)
        run_stats = JobExecution.list_run_stats_by_hours(project_id)
        return render_template("project_stats.html",
                               project=project,
                               spider=spider,
                               working_time=working_time,
                               last_run=last_run,
                               quality_review=quality_review,
                               last_ee=last_ee,
                               run_stats=run_stats)
    elif spider_id == "server":
        project = Project.find_project_by_id(project_id)
        run_stats = JobExecution.list_run_stats_by_hours(project_id)
        request_stats = JobExecution.list_request_stats_by_hours(project_id, spider_id)
        item_stats = JobExecution.list_item_stats_by_hours(project_id, spider_id)
        return render_template("server_stats.html", run_stats=run_stats)
    else:
        project = Project.find_project_by_id(project_id)
        spider = SpiderInstance.query.filter_by(project_id=project_id, id=spider_id).first()
        results = JobExecution.list_spider_stats(project_id, spider_id)
        start_time = []
        end_time = []
        end_time_short = []
        duration_time = []
        requests_count = []
        items_count = []
        items_cached = []
        warnings_count = []
        errors_count = []
        bytes_count = []
        retries_count = []
        exceptions_count = []
        exceptions_size = []
        cache_size_count = []
        cache_object_count = []
        last_start_time = ""
        last_items_count = ""
        old_items_count = []
        # Display date trick for small charts
        displayDates = False
        displayedDates = []
        for i in range(0, len(results)):
            if (results[i]['end_time'] is not None) and \
                    (results[i]['end_time'].split(" ")[0] not in displayedDates):
                displayedDates.append(results[i]['end_time'].split(" ")[0])
                if len(displayedDates) > 2:
                    displayDates = True
        # remove last JobInstance if not started or not finished
        if (len(results) > 0) and ((results[-1]['start_time'] is None) or
                                   (results[-1]['end_time'] is None)):
            results.pop()
        for i in range(0, len(results)):
            if i == len(results) - 1:
                last_start_time = results[i]['start_time']
                last_items_count = results[i]['items_count']
            else:
                old_items_count.append(results[i]['items_count'])
            start_time.append(results[i]['start_time'])
            end_time.append(results[i]['end_time'])
            duration_time.append(
                (datetime.datetime.strptime(results[i]['end_time'], '%Y-%m-%d %H:%M:%S') -
                 datetime.datetime.strptime(results[i]['start_time'], '%Y-%m-%d %H:%M:%S')
                 ).total_seconds())
            if displayDates:
                end_time_short.append(end_time[-1].split(" ")[0])
            else:
                end_time_short.append(end_time[-1].split(" ")[1])
            requests_count.append(results[i]['requests_count'])
            items_count.append(results[i]['items_count'])
            if results[i]['items_count'] != 0 and \
                    results[i]['items_count'] - results[i]['requests_count'] >= 0:
                items_cached.append(results[i]['items_count'] - results[i]['requests_count'])
            else:
                items_cached.append(0)
            warnings_count.append(results[i]['warnings_count'])
            errors_count.append(results[i]['errors_count'])
            bytes_count.append(results[i]['bytes_count'])
            retries_count.append(results[i]['retries_count'])
            exceptions_count.append(results[i]['exceptions_count'])
            if results[i]['exceptions_count'] > 10:
                exceptions_size.append(30)
            else:
                exceptions_size.append(results[i]['exceptions_count'] * 3)
            cache_size_count.append(results[i]['cache_size_count'])
            cache_object_count.append(results[i]['cache_object_count'])
        # tricks to have a nice gauge
        if len(results) == 0:
            min_items_count = 0
            max_items_count = 100
            average_items_count = 50
        else:
            items_not_null = [i for i in old_items_count if i != 0]
            if len(items_not_null) == 0:
                items_not_null = [0]
            min_items_count = min(items_not_null)
            if len(old_items_count) == 0:
                max_items_count = last_items_count
            else:
                max_items_count = max(old_items_count)
            average_items_count = sum(items_not_null) / len(items_not_null)
            if max_items_count == 0:
                min_items_count = 0
            else:
                if (min_items_count / max_items_count) > 0.8:
                    min_items_count = max_items_count * 0.8
                if (average_items_count / max_items_count) > 0.95 or \
                        max_items_count == last_items_count:
                    max_items_count = average_items_count * 1.05
        return render_template("spider_stats.html",
                               spider=spider,
                               start_time=start_time,
                               end_time=end_time,
                               end_time_short=end_time_short,
                               duration_time=duration_time,
                               last_start_time=last_start_time,
                               last_items_count=last_items_count,
                               average_items_count=average_items_count,
                               min_items_count=min_items_count,
                               max_items_count=max_items_count,
                               requests_count=requests_count,
                               items_count=items_count,
                               items_cached=items_cached,
                               warnings_count=warnings_count,
                               errors_count=errors_count,
                               bytes_count=bytes_count,
                               retries_count=retries_count,
                               exceptions_count=exceptions_count,
                               exceptions_size=exceptions_size,
                               cache_size_count=cache_size_count,
                               cache_object_count=cache_object_count)
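# Worked example with made-up numbers for the gauge clamping above: with a history of
# [90, 95, 100], min (90) sits at 90% of max (100), too close for a readable gauge,
# so min is pulled down to 80% of max.
old_items_count = [90, 95, 100]
items_not_null = [i for i in old_items_count if i != 0]
min_items_count = min(items_not_null)                            # 90
max_items_count = max(old_items_count)                           # 100
average_items_count = sum(items_not_null) / len(items_not_null)  # 95.0
if (min_items_count / max_items_count) > 0.8:
    min_items_count = max_items_count * 0.8                      # widened to 80.0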
def job_dashboard(project_id):
    return render_template("job_dashboard.html",
                           job_status=JobExecution.list_jobs(project_id))
def get(self, project_id):
    return JobExecution.list_jobs(project_id)
def start_spider(self, job_instance):
    # prevent jobs overlapping for the same spider
    if not job_instance.overlapping and self._spider_already_running(
            job_instance.spider_name, job_instance.project_id):
        return
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    # arguments = {}
    # if job_instance.spider_arguments:
    #     arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
    from collections import defaultdict
    arguments = defaultdict(list)
    if job_instance.spider_arguments:
        for k, v in list(map(lambda x: x.strip().split('=', 1),
                             job_instance.spider_arguments.split(','))):
            arguments[k].append(v)
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        # pin the job to the daemon the caller asked for
        for candidate in candidates:
            if candidate.server == arguments['daemon'][0]:
                leaders = [candidate]
    elif not config.RUNS_IN_CLOUD:
        # no memory telemetry outside the cloud; this keeps only the last
        # candidate, i.e. effectively a single fixed leader
        for candidate in candidates:
            leaders = [candidate]
    else:
        # elect the instance with the lowest reported memory usage
        instance_ids = get_cluster_instances_ids(app)
        instance_stats = {}
        for i in instance_ids:
            ips = get_instances_private_ips(app, [i])
            if len(ips) < 1:
                continue
            ip = ips.pop(0)
            instance_stats[ip] = get_instance_memory_usage(app, i)
        ip, _ = sorted(instance_stats.items(), key=lambda kv: kv[1] or 0).pop(0)
        # TODO optimize some better func to vote the leader
        for i in range(threshold):
            for candidate in candidates:
                if ip in candidate.server:
                    leaders.append(candidate)
    for leader in leaders:
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            raise e
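# Illustration with fabricated stats: the sorted(...).pop(0) above elects the private
# IP with the lowest reported memory usage. Note that a None reading (no telemetry)
# sorts as 0 and therefore wins the election as if that instance were idle.
instance_stats = {'10.0.1.5': 71.2, '10.0.1.6': None, '10.0.1.7': 43.9}
ip, usage = sorted(instance_stats.items(), key=lambda kv: kv[1] or 0).pop(0)
print(ip, usage)  # 10.0.1.6 None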
def _spider_already_running(self, spider_name, project_id):
    running_jobs = JobExecution.get_running_jobs_by_spider_name(spider_name, project_id)
    return len(running_jobs) > 0