Code Example #1
    def sync_job_status(self, project):
        found_jobs = []

        job_execution_list = JobExecution.list_uncomplete_job(project)
        job_execution_dict = dict(
            [(job_execution.service_job_execution_id, job_execution) for job_execution in job_execution_list])

        for spider_service_instance in self.spider_service_instances:
            job_status = spider_service_instance.get_job_list(project.project_name)
            # collect the service job ids reported by this instance, grouped by status
            pending_found_jobs = self._process_pending_jobs(job_status)
            running_found_jobs = self._process_running_jobs(job_status, job_execution_dict)
            finished_found_jobs = self.process_finished_jobs(job_status, job_execution_dict)

            found_jobs += pending_found_jobs
            found_jobs += running_found_jobs
            found_jobs += finished_found_jobs

        # mark jobs as CRASHED
        for job_execution in job_execution_list:
            if job_execution.service_job_execution_id not in found_jobs:
                job_execution.running_status = SpiderStatus.CRASHED
                job_execution.end_time = datetime.datetime.now()

        # commit
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            raise e
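The `_process_*` helpers called above are not shown on this page. A minimal sketch of `_process_running_jobs`, assuming it mirrors the inline RUNNING loop from Code Example #6 and returns the service job ids it saw (the body below is an assumption, not the project's actual helper):

    def _process_running_jobs(self, job_status, job_execution_dict):
        # assumed helper: promote PENDING executions that are now running and
        # report which service job ids were seen on this instance
        found_jobs = []
        for job_execution_info in job_status[SpiderStatus.RUNNING]:
            found_jobs.append(job_execution_info['id'])
            job_execution = job_execution_dict.get(job_execution_info['id'])
            if job_execution and job_execution.running_status == SpiderStatus.PENDING:
                job_execution.start_time = job_execution_info['start_time']
                job_execution.running_status = SpiderStatus.RUNNING
        return found_jobs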
Code Example #2
File: controller.py Project: divtiply/ScrapyKeeper
def job_favorites(project_id):
    unique_spiders = set()
    unique_favorite_jobs = list()
    spider_colours = {}

    if request.method == 'POST':
        favorite_spiders = list(filter(None, request.form['favorite'].split(',')))
        jobs = JobExecution.favorite_spiders_jobs(project_id, favorite_spiders, 5000) if favorite_spiders else []

        #  keep only one instance for every spider
        for job in jobs:
            instance = job['job_instance']
            if instance.get('spider_name') not in unique_spiders:
                unique_favorite_jobs.append(job)
            unique_spiders.add(instance.get('spider_name'))

        for spider_name in unique_spiders:
            spider_id, old_items_count = JobExecution.get_last_execution_by_spider(spider_name, project_id)
            if not old_items_count:
                spider_colours[spider_name] = {
                    'colour': None,
                    'spider_id': spider_id
                }
                continue

            last_items_count = old_items_count.pop(0)
            (min_items_count, average_items_count, max_items_count) = _compute_item_stats(old_items_count, last_items_count)

            if 0 <= last_items_count <= min_items_count:
                colour = 'danger'
            elif min_items_count < last_items_count <= max_items_count:
                colour = 'success'
            else:
                colour = 'warning'

            spider_colours[spider_name] = {
                'colour': colour,
                'spider_id': spider_id
            }

    return render_template("job_favorites.html", job_status=unique_favorite_jobs, spider_colours=spider_colours,
                           method=request.method)
Code Example #3
File: controller.py Project: divtiply/ScrapyKeeper
def job_dashboard(project_id):
    jobs = JobExecution.list_jobs(project_id)
    unique_spiders = set()

    for job in jobs['COMPLETED']:
        instance = job['job_instance']
        unique_spiders.add(instance.get('spider_name'))

    spider_colours = {}

    for spider_name in unique_spiders:
        spider_id, old_items_count = JobExecution.get_last_execution_by_spider(spider_name, project_id)
        if not old_items_count:
            spider_colours[spider_name] = {
                'colour': None,
                'spider_id': spider_id
            }
            continue

        last_items_count = old_items_count.pop(0)
        (min_items_count, average_items_count, max_items_count) = _compute_item_stats(old_items_count, last_items_count)

        if 0 <= last_items_count <= min_items_count:
            colour = 'danger'
        elif min_items_count < last_items_count <= max_items_count:
            colour = 'success'
        else:
            colour = 'warning'

        spider_colours[spider_name] = {
            'colour': colour,
            'spider_id': spider_id
        }

    return render_template("job_dashboard.html", job_status=jobs, spider_colours=spider_colours,
                           bit_enabled=config.BACK_IN_TIME_ENABLED)
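Code Examples #2 and #3 both delegate to a `_compute_item_stats` helper that does not appear on this page. A minimal sketch, assuming it condenses the "nice gauge" arithmetic that Code Example #7 performs inline (the body below is an assumption, not the project's actual helper):

    def _compute_item_stats(old_items_count, last_items_count):
        # assumed helper: derive gauge bounds (min, average, max) from past item counts
        if not old_items_count:
            return 0, 50, 100
        items_not_null = [i for i in old_items_count if i != 0] or [0]
        min_items_count = min(items_not_null)
        max_items_count = max(old_items_count)
        average_items_count = sum(items_not_null) / len(items_not_null)
        if max_items_count == 0:
            min_items_count = 0
        else:
            if min_items_count / max_items_count > 0.8:
                min_items_count = max_items_count * 0.8
            if (average_items_count / max_items_count > 0.95
                    or max_items_count == last_items_count):
                max_items_count = average_items_count * 1.05
        return min_items_count, average_items_count, max_items_count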
Code Example #4
    def start_spider(self, job_instance):
        project = Project.find_project_by_id(job_instance.project_id)
        spider_name = job_instance.spider_name
        # arguments = {}
        # if job_instance.spider_arguments:
        #     arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
        from collections import defaultdict
        arguments = defaultdict(list)
        if job_instance.spider_arguments:
            for k, v in list(
                    map(lambda x: [y.strip() for y in x.split('=', 1)],
                        job_instance.spider_arguments.split(','))):
                arguments[k].append(v)
        threshold = 0
        daemon_size = len(self.spider_service_instances)
        if job_instance.priority == JobPriority.HIGH:
            threshold = int(daemon_size / 2)
        if job_instance.priority == JobPriority.HIGHEST:
            threshold = int(daemon_size)
        threshold = 1 if threshold == 0 else threshold
        candidates = self.spider_service_instances
        leaders = []
        if 'daemon' in arguments:
            for candidate in candidates:
                if candidate.server == arguments['daemon'][0]:
                    leaders = [candidate]
        else:
            # TODO optimize some better func to vote the leader
            for i in range(threshold):
                leaders.append(random.choice(candidates))
        for leader in leaders:
            service_job_id = leader.start_spider(project.project_name,
                                                 spider_name, arguments)
            job_execution = JobExecution()
            job_execution.project_id = job_instance.project_id
            job_execution.service_job_execution_id = service_job_id
            job_execution.job_instance_id = job_instance.id
            job_execution.create_time = datetime.datetime.now()
            job_execution.running_on = leader.server
            try:
                db.session.add(job_execution)
                db.session.commit()
            except Exception:
                db.session.rollback()
                raise
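For illustration, the defaultdict-based parsing above turns the comma-separated spider_arguments string into a dict of lists, so a repeated key accumulates values instead of overwriting them. A standalone sketch of the same idea (the input string is made up):

    from collections import defaultdict

    spider_arguments = 'daemon=srv1:6800, tag=news, tag=sports'  # hypothetical input
    arguments = defaultdict(list)
    for pair in spider_arguments.split(','):
        k, v = [part.strip() for part in pair.split('=', 1)]
        arguments[k].append(v)
    print(dict(arguments))
    # {'daemon': ['srv1:6800'], 'tag': ['news', 'sports']}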
Code Example #5
    def run_back_in_time(self, job_instance):
        # prevent jobs overlapping for the same spider
        if not job_instance.overlapping and self._spider_already_running(job_instance.spider_name,
                                                                         job_instance.project_id):
            return

        project = Project.find_project_by_id(job_instance.project_id)
        spider_name = job_instance.spider_name
        from collections import defaultdict
        arguments = defaultdict(list)
        if job_instance.spider_arguments:
            for k, v in list(map(lambda x: x.strip().split('=', 1), job_instance.spider_arguments.split(','))):
                arguments[k].append(v)
        threshold = 0
        daemon_size = len(self.spider_service_instances)
        if job_instance.priority == JobPriority.HIGH:
            threshold = int(daemon_size / 2)
        if job_instance.priority == JobPriority.HIGHEST:
            threshold = int(daemon_size)
        threshold = 1 if threshold == 0 else threshold
        candidates = self.spider_service_instances
        leaders = []
        if 'daemon' in arguments:
            for candidate in candidates:
                if candidate.server == arguments['daemon'][0]:
                    leaders = [candidate]
        else:
            # TODO optimize some better func to vote the leader
            for i in range(threshold):
                leaders.append(random.choice(candidates))
        for leader in leaders:
            service_job_id = leader.back_in_time(project.project_name, spider_name, arguments)
            job_execution = JobExecution()
            job_execution.project_id = job_instance.project_id
            job_execution.service_job_execution_id = service_job_id
            job_execution.job_instance_id = job_instance.id
            job_execution.create_time = datetime.datetime.now()
            job_execution.running_on = leader.server
            db.session.add(job_execution)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                raise e
Code Example #6
    def sync_job_status(self, project):
        for spider_service_instance in self.spider_service_instances:
            job_status = spider_service_instance.get_job_list(
                project.project_name)
            job_execution_list = JobExecution.list_uncomplete_job()
            job_execution_dict = dict([
                (job_execution.service_job_execution_id, job_execution)
                for job_execution in job_execution_list
            ])
            # running
            for job_execution_info in job_status[SpiderStatus.RUNNING]:
                job_execution = job_execution_dict.get(
                    job_execution_info['id'])
                if job_execution and job_execution.running_status == SpiderStatus.PENDING:
                    job_execution.start_time = job_execution_info['start_time']
                    job_execution.running_status = SpiderStatus.RUNNING

            # finished
            for job_execution_info in job_status[SpiderStatus.FINISHED]:
                job_execution = job_execution_dict.get(
                    job_execution_info['id'])
                if job_execution and job_execution.running_status != SpiderStatus.FINISHED:
                    job_execution.start_time = job_execution_info['start_time']
                    job_execution.end_time = job_execution_info['end_time']
                    job_execution.running_status = SpiderStatus.FINISHED

                    res = requests.get(self.log_url(job_execution))
                    res.encoding = 'utf8'
                    raw = res.text[-4096:]
                    match = re.findall(job_execution.RAW_STATS_REGEX, raw,
                                       re.DOTALL)
                    if match:
                        job_execution.raw_stats = match[0]
                        job_execution.process_raw_stats()
            # commit
            try:
                db.session.commit()
            except Exception:
                db.session.rollback()
                raise
Code Example #7
File: controller.py Project: bsekiewicz/ScrapyKeeper
def project_stats(project_id, spider_id):
    if spider_id == "project":
        project = Project.find_project_by_id(project_id)
        spider = SpiderInstance.query.filter_by(project_id=project_id).all()
        working_time = JobExecution.list_working_time(project_id)
        last_run = JobExecution.list_last_run(project_id)
        quality_review = JobExecution.list_quality_review(project_id)
        last_ee = JobExecution.list_last_ee(project_id)
        run_stats = JobExecution.list_run_stats_by_hours(project_id)
        return render_template("project_stats.html",
                               project=project,
                               spider=spider,
                               working_time=working_time,
                               last_run=last_run,
                               quality_review=quality_review,
                               last_ee=last_ee,
                               run_stats=run_stats)

    elif spider_id == "server":
        project = Project.find_project_by_id(project_id)
        run_stats = JobExecution.list_run_stats_by_hours(project_id)
        request_stats = JobExecution.list_request_stats_by_hours(
            project_id, spider_id)
        item_stats = JobExecution.list_item_stats_by_hours(
            project_id, spider_id)
        return render_template("server_stats.html", run_stats=run_stats)

    else:
        project = Project.find_project_by_id(project_id)
        spider = SpiderInstance.query.filter_by(project_id=project_id,
                                                id=spider_id).first()
        results = JobExecution.list_spider_stats(project_id, spider_id)

        start_time = []
        end_time = []
        end_time_short = []
        duration_time = []
        requests_count = []
        items_count = []
        items_cached = []
        warnings_count = []
        errors_count = []
        bytes_count = []
        retries_count = []
        exceptions_count = []
        exceptions_size = []
        cache_size_count = []
        cache_object_count = []
        last_start_time = ""
        last_items_count = ""
        old_items_count = []

        # Display date trick for small charts
        displayDates = False
        displayedDates = []
        for i in range(0, len(results)):
            if (results[i]['end_time'] is not None
                    and results[i]['end_time'].split(" ")[0] not in displayedDates):
                displayedDates.append(results[i]['end_time'].split(" ")[0])
        if len(displayedDates) > 2:
            displayDates = True

        # drop the last execution if it has not started or not finished yet
        if len(results) > 0 and (results[-1]['start_time'] is None
                                 or results[-1]['end_time'] is None):
            results.pop()

        for i in range(0, len(results)):
            if i == len(results) - 1:
                last_start_time = results[i]['start_time']
                last_items_count = results[i]['items_count']
            else:
                old_items_count.append(results[i]['items_count'])

            start_time.append(results[i]['start_time'])
            end_time.append(results[i]['end_time'])
            duration_time.append((datetime.datetime.strptime(
                results[i]['end_time'], '%Y-%m-%d %H:%M:%S') -
                                  datetime.datetime.strptime(
                                      results[i]['start_time'],
                                      '%Y-%m-%d %H:%M:%S')).total_seconds())

            if displayDates:
                end_time_short.append(end_time[-1].split(" ")[0])
            else:
                end_time_short.append(end_time[-1].split(" ")[1])

            requests_count.append(results[i]['requests_count'])
            items_count.append(results[i]['items_count'])
            if results[i]['items_count'] != 0:
                if results[i]['items_count'] - results[i][
                        'requests_count'] >= 0:
                    items_cached.append(results[i]['items_count'] -
                                        results[i]['requests_count'])
                else:
                    items_cached.append(0)
            else:
                items_cached.append(0)
            warnings_count.append(results[i]['warnings_count'])
            errors_count.append(results[i]['errors_count'])
            bytes_count.append(results[i]['bytes_count'])
            retries_count.append(results[i]['retries_count'])

            exceptions_count.append(results[i]['exceptions_count'])
            if results[i]['exceptions_count'] > 10:
                exceptions_size.append(30)
            else:
                exceptions_size.append(results[i]['exceptions_count'] * 3)

            cache_size_count.append(results[i]['cache_size_count'])
            cache_object_count.append(results[i]['cache_object_count'])

        # tricks to have a nice gauge
        if len(results) == 0:
            min_items_count = 0
            max_items_count = 100
            average_items_count = 50
        else:
            items_not_null = []
            for i in old_items_count:
                if i != 0:
                    items_not_null.append(i)
            if len(items_not_null) == 0:
                items_not_null = [0]
            min_items_count = min(items_not_null)
            if len(old_items_count) == 0:
                max_items_count = last_items_count
            else:
                max_items_count = max(old_items_count)
            average_items_count = sum(items_not_null) / len(items_not_null)
            if max_items_count == 0:
                min_items_count = 0
            else:
                if (min_items_count / max_items_count) > 0.8:
                    min_items_count = max_items_count * 0.8
                if (average_items_count / max_items_count
                    ) > 0.95 or max_items_count == last_items_count:
                    max_items_count = average_items_count * 1.05

        return render_template("spider_stats.html",
                               spider=spider,
                               start_time=start_time,
                               end_time=end_time,
                               end_time_short=end_time_short,
                               duration_time=duration_time,
                               last_start_time=last_start_time,
                               last_items_count=last_items_count,
                               average_items_count=average_items_count,
                               min_items_count=min_items_count,
                               max_items_count=max_items_count,
                               requests_count=requests_count,
                               items_count=items_count,
                               items_cached=items_cached,
                               warnings_count=warnings_count,
                               errors_count=errors_count,
                               bytes_count=bytes_count,
                               retries_count=retries_count,
                               exceptions_count=exceptions_count,
                               exceptions_size=exceptions_size,
                               cache_size_count=cache_size_count,
                               cache_object_count=cache_object_count)
Code Example #8
File: controller.py Project: bsekiewicz/ScrapyKeeper
def job_dashboard(project_id):
    return render_template("job_dashboard.html",
                           job_status=JobExecution.list_jobs(project_id))
Code Example #9
File: controller.py Project: bsekiewicz/ScrapyKeeper
    def get(self, project_id):
        return JobExecution.list_jobs(project_id)
Code Example #10
    def start_spider(self, job_instance):
        # prevent jobs overlapping for the same spider
        if not job_instance.overlapping and self._spider_already_running(job_instance.spider_name,
                                                                         job_instance.project_id):
            return

        project = Project.find_project_by_id(job_instance.project_id)
        spider_name = job_instance.spider_name
        # arguments = {}
        # if job_instance.spider_arguments:
        #    arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
        from collections import defaultdict
        arguments = defaultdict(list)
        if job_instance.spider_arguments:
            for k, v in list(map(lambda x: x.strip().split('=', 1), job_instance.spider_arguments.split(','))):
                arguments[k].append(v)
        threshold = 0
        daemon_size = len(self.spider_service_instances)
        if job_instance.priority == JobPriority.HIGH:
            threshold = int(daemon_size / 2)
        if job_instance.priority == JobPriority.HIGHEST:
            threshold = int(daemon_size)
        threshold = 1 if threshold == 0 else threshold
        candidates = self.spider_service_instances
        leaders = []
        if 'daemon' in arguments:
            for candidate in candidates:
                if candidate.server == arguments['daemon'][0]:
                    leaders = [candidate]
        elif not config.RUNS_IN_CLOUD:
            for candidate in candidates:
                leaders = [candidate]
        else:
            instance_ids = get_cluster_instances_ids(app)
            instance_stats = {}
            for i in instance_ids:
                ips = get_instances_private_ips(app, [i])
                if len(ips) < 1:
                    continue
                ip = ips.pop(0)
                instance_stats[ip] = get_instance_memory_usage(app, i)

            ip, _ = sorted(instance_stats.items(), key=lambda kv: kv[1] or 0).pop(0)

            # TODO optimize some better func to vote the leader
            for i in range(threshold):
                for candidate in candidates:
                    if ip in candidate.server:
                        leaders.append(candidate)

        for leader in leaders:
            service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
            job_execution = JobExecution()
            job_execution.project_id = job_instance.project_id
            job_execution.service_job_execution_id = service_job_id
            job_execution.job_instance_id = job_instance.id
            job_execution.create_time = datetime.datetime.now()
            job_execution.running_on = leader.server
            db.session.add(job_execution)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                raise e
Code Example #11
    def _spider_already_running(self, spider_name, project_id):
        running_jobs = JobExecution.get_running_jobs_by_spider_name(spider_name, project_id)

        return len(running_jobs) > 0