def start_spider(self, job_instance): project = Project.find_project_by_id(job_instance.project_id) spider_name = job_instance.spider_name #arguments = {} #if job_instance.spider_arguments: # arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(","))) from collections import defaultdict arguments = defaultdict(list) if job_instance.spider_arguments: for k, v in list( map(lambda x: [y.strip() for y in x.split('=', 1)], job_instance.spider_arguments.split(','))): arguments[k].append(v) threshold = 0 daemon_size = len(self.spider_service_instances) if job_instance.priority == JobPriority.HIGH: threshold = int(daemon_size / 2) if job_instance.priority == JobPriority.HIGHEST: threshold = int(daemon_size) threshold = 1 if threshold == 0 else threshold candidates = self.spider_service_instances leaders = [] if 'daemon' in arguments: for candidate in candidates: if candidate.server == arguments['daemon'][0]: leaders = [candidate] else: # TODO optimize some better func to vote the leader for i in range(threshold): leaders.append(random.choice(candidates)) for leader in leaders: serviec_job_id = leader.start_spider(project.project_name, spider_name, arguments) job_execution = JobExecution() job_execution.project_id = job_instance.project_id job_execution.service_job_execution_id = serviec_job_id job_execution.job_instance_id = job_instance.id job_execution.create_time = datetime.datetime.now() job_execution.running_on = leader.server try: db.session.add(job_execution) db.session.commit() except: db.session.rollback() raise
def run_back_in_time(self, job_instance): # prevent jobs overlapping for the same spider if not job_instance.overlapping and self._spider_already_running(job_instance.spider_name, job_instance.project_id): return project = Project.find_project_by_id(job_instance.project_id) spider_name = job_instance.spider_name from collections import defaultdict arguments = defaultdict(list) if job_instance.spider_arguments: for k, v in list(map(lambda x: x.strip().split('=', 1), job_instance.spider_arguments.split(','))): arguments[k].append(v) threshold = 0 daemon_size = len(self.spider_service_instances) if job_instance.priority == JobPriority.HIGH: threshold = int(daemon_size / 2) if job_instance.priority == JobPriority.HIGHEST: threshold = int(daemon_size) threshold = 1 if threshold == 0 else threshold candidates = self.spider_service_instances leaders = [] if 'daemon' in arguments: for candidate in candidates: if candidate.server == arguments['daemon'][0]: leaders = [candidate] else: # TODO optimize some better func to vote the leader for i in range(threshold): leaders.append(random.choice(candidates)) for leader in leaders: service_job_id = leader.back_in_time(project.project_name, spider_name, arguments) job_execution = JobExecution() job_execution.project_id = job_instance.project_id job_execution.service_job_execution_id = service_job_id job_execution.job_instance_id = job_instance.id job_execution.create_time = datetime.datetime.now() job_execution.running_on = leader.server db.session.add(job_execution) try: db.session.commit() except Exception as e: db.session.rollback() raise e
def start_spider(self, job_instance): # prevent jobs overlapping for the same spider if not job_instance.overlapping and self._spider_already_running(job_instance.spider_name, job_instance.project_id): return project = Project.find_project_by_id(job_instance.project_id) spider_name = job_instance.spider_name # arguments = {} # if job_instance.spider_arguments: # arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(","))) from collections import defaultdict arguments = defaultdict(list) if job_instance.spider_arguments: for k, v in list(map(lambda x: x.strip().split('=', 1), job_instance.spider_arguments.split(','))): arguments[k].append(v) threshold = 0 daemon_size = len(self.spider_service_instances) if job_instance.priority == JobPriority.HIGH: threshold = int(daemon_size / 2) if job_instance.priority == JobPriority.HIGHEST: threshold = int(daemon_size) threshold = 1 if threshold == 0 else threshold candidates = self.spider_service_instances leaders = [] if 'daemon' in arguments: for candidate in candidates: if candidate.server == arguments['daemon'][0]: leaders = [candidate] elif not config.RUNS_IN_CLOUD: for candidate in candidates: leaders = [candidate] else: instance_ids = get_cluster_instances_ids(app) instance_stats = {} for i in instance_ids: ips = get_instances_private_ips(app, [i]) if len(ips) < 1: continue ip = ips.pop(0) instance_stats[ip] = get_instance_memory_usage(app, i) ip, _ = sorted(instance_stats.items(), key=lambda kv: kv[1] or 0).pop(0) # TODO optimize some better func to vote the leader for i in range(threshold): for candidate in candidates: if ip in candidate.server: leaders.append(candidate) for leader in leaders: service_job_id = leader.start_spider(project.project_name, spider_name, arguments) job_execution = JobExecution() job_execution.project_id = job_instance.project_id job_execution.service_job_execution_id = service_job_id job_execution.job_instance_id = job_instance.id job_execution.create_time = datetime.datetime.now() job_execution.running_on = leader.server db.session.add(job_execution) try: db.session.commit() except Exception as e: db.session.rollback() raise e