class Project(Base):
    """Spider project ORM model (name + statistics URL)."""
    __tablename__ = 'sk_project'

    project_name = db.Column(db.String(50))   # project name
    stat_url = db.Column(db.String(500))      # URL of the project's statistics page

    @classmethod
    def load_project(cls, project_list):
        """Insert every project from *project_list* that is not already stored."""
        for candidate in project_list:
            already_stored = cls.query.filter_by(
                project_name=candidate.project_name).first()
            if already_stored:
                continue
            db.session.add(candidate)
            db.session.commit()

    @classmethod
    def find_project_by_id(cls, project_id):
        """Return the project row with the given primary key, or None."""
        return cls.query.filter_by(id=project_id).first()

    def to_dict(self):
        """Serialize this project for API responses."""
        return {
            "project_id": self.id,
            "project_name": self.project_name,
            "stat_url": self.stat_url,
        }
class WebMonitorLog(db.Model):
    """One monitoring probe of a target website: the status observed at a point in time."""
    __tablename__ = 'target_web_monitor_log'

    id = db.Column(db.Integer, primary_key=True)
    web_id = db.Column(db.String(20))  # id of the monitored site — presumably a WebMonitor row id; TODO confirm against caller
    status = db.Column(db.String(20), default='正常')  # status observed by this probe ('正常' = "normal")
    monitor_date = db.Column(db.DATETIME, default=db.func.current_timestamp())  # when the probe ran
class Project(Base):
    """Spider project identified by a random 16-character hex project_id."""
    __tablename__ = 'sk_project'

    project_name = db.Column(db.String(50))  # project name
    project_id = db.Column(db.String(16), nullable=False, index=True)  # random 16-char id (not the PK)

    def __init__(self, **kwargs):
        """Create a project; generates project_id if one was not supplied.

        Fix: the original ``__init__(self)`` accepted no keyword arguments,
        which broke SQLAlchemy's declarative constructor
        (``Project(project_name='x')`` raised TypeError). Forward kwargs to
        the base class, then fill in a generated id when none was given.
        """
        super().__init__(**kwargs)
        if not self.project_id:
            # uuid4 hex truncated to 16 chars to fit the column width
            self.project_id = str(uuid.uuid4()).replace('-', '')[:16]

    @classmethod
    def load_project(cls, project_list):
        """Insert every project from *project_list* whose name is not already stored."""
        for project in project_list:
            existed_project = cls.query.filter_by(project_name=project.project_name).first()
            if not existed_project:
                db.session.add(project)
                db.session.commit()

    @classmethod
    def find_project_by_id(cls, project_id):
        """Return the project with the given project_id (random id, not PK), or None."""
        return cls.query.filter_by(project_id=project_id).first()

    def to_dict(self):
        """Serialize this project for API responses."""
        return {
            "project_id": self.project_id,
            "project_name": self.project_name
        }
class TagProjectShip(Base):
    """Association between a project and its tags (developer / cite / theme / industry)."""
    project_name = db.Column(db.String(50), unique=True)
    developer_name = db.Column(db.String(50))  # developer tag
    cite_name = db.Column(db.String(50))       # requirement-source tag
    theme_name = db.Column(db.String(50))      # theme tag
    industry_name = db.Column(db.String(50))   # industry tag
class WebMonitor(db.Model):
    """Current monitoring state of a target website."""
    __tablename__ = 'target_web_monitor'

    id = db.Column(db.Integer, primary_key=True)
    web_name = db.Column(db.String(20))   # site name
    web_url = db.Column(db.String(100))   # site URL
    status = db.Column(db.String(20), default='正常')  # current status ('正常' = "normal")
    start_date = db.Column(db.DATETIME, default=db.func.current_timestamp())  # monitoring start time
    # Fix: was db.func.current_timestap() — a typo that would emit the
    # non-existent SQL function CURRENT_TIMESTAP on insert and fail.
    end_date = db.Column(db.DATETIME, default=db.func.current_timestamp())  # time of the last probe
    disconnect_num = db.Column(db.Integer, default=0)  # number of observed disconnects
    disconnect_time = db.Column(db.DATETIME)  # time of the most recent disconnect
class Serversmachine(Base):
    """A crawler server machine in the master/slave cluster."""
    __tablename__ = 'sk_serversmachine'

    server_ip = db.Column(db.String(50))      # server IP address
    server_status = db.Column(db.String(50))  # '0' = unavailable, '1' = available
    is_master = db.Column(db.String(50))      # '0' = slave, '1' = master

    def to_dict(self):
        """Serialize this machine for API responses."""
        return {
            'server_ip': self.server_ip,
            'server_status': self.server_status,
            'is_master': self.is_master,
        }
class Developer(Base):
    """A developer who owns crawl projects."""
    __tablename__ = 'sk_developer'

    developer_name = db.Column(db.String(50), unique=True)  # developer name
    developer_role = db.Column(db.String(50))  # employment kind: regular / intern / seconded
    developer_status = db.Column(db.String(50))  # '0' = left the company, '1' = active

    def to_dict(self):
        """Serialize this developer for API responses."""
        return {
            'id': self.id,
            'developer_name': self.developer_name,
            'developer_role': self.developer_role,
            'developer_status': self.developer_status,
        }
class Videoitems(db.Model):
    """A crawled video item."""
    __tablename__ = 'videoitems'

    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(500), nullable=False)   # video title
    url = db.Column(db.String(100), nullable=False, index=True)  # video URL
    keywords = db.Column(db.String(100), nullable=False)  # search keywords that found this item
    # Fix: default was the mutable list [] on a String column — a list cannot
    # be bound as a VARCHAR parameter, so inserts relying on the default
    # would fail. Use an empty string instead.
    tags = db.Column(db.String(1000), default="")
    video_category = db.Column(db.String(50), default="其它")  # category ('其它' = "other")
    upload_time = db.Column(db.String(50))   # upload time (stored as text)
    spider_time = db.Column(db.String(50))   # crawl time (stored as text)
    info = db.Column(db.Text)                # free-form description
    site_name = db.Column(db.String(20), default="")  # source site name
    video_time = db.Column(db.Integer, default=0)     # duration — presumably seconds; TODO confirm
    isdownload = db.Column(db.Integer, default=0)     # download flag
    play_count = db.Column(db.String(20), default="0")  # play count (stored as text)
    task_id = db.Column(db.String(20))       # id of the crawl task that produced this item
class SpiderInstance(Base):
    """A spider registered under a crawl project."""
    __tablename__ = 'sk_spider'

    spider_name = db.Column(db.String(100))
    project_id = db.Column(db.INTEGER, nullable=False, index=True)

    @classmethod
    def update_spider_instances(cls, spider_instance_list):
        """Insert each spider from the list unless an identical row already exists."""
        for instance in spider_instance_list:
            duplicate = cls.query.filter_by(
                project_id=instance.project_id,
                spider_name=instance.spider_name).first()
            if duplicate:
                continue
            db.session.add(instance)
            db.session.commit()

    @classmethod
    def list_spider_by_project_id(cls, project_id):
        """All spiders belonging to the given project."""
        return cls.query.filter_by(project_id=project_id).all()

    def to_dict(self):
        """Serialize this spider for API responses."""
        return {
            'spider_instance_id': self.id,
            'spider_name': self.spider_name,
            'project_id': self.project_id,
        }

    @classmethod
    def list_spiders(cls, project_id):
        """List every spider of a project with its last and average runtime."""
        # Latest job-execution creation time per spider name.
        sql_last_runtime = '''
            select * from (select a.spider_name,b.date_created from sk_job_instance as a
            left join sk_job_execution as b on a.id = b.job_instance_id
            order by b.date_created desc) as c
            group by c.spider_name
        '''
        # Average execution duration per spider (finished jobs only).
        sql_avg_runtime = '''
            select a.spider_name,avg(end_time-start_time) from sk_job_instance as a
            left join sk_job_execution as b on a.id = b.job_instance_id
            where b.end_time is not null
            group by a.spider_name
        '''
        last_runtimes = {name: ts for name, ts in db.engine.execute(sql_last_runtime)}
        avg_runtimes = {name: avg for name, avg in db.engine.execute(sql_avg_runtime)}
        spiders = []
        for spider in cls.query.filter_by(project_id=project_id).all():
            last = last_runtimes.get(spider.spider_name)
            info = spider.to_dict()
            info['spider_last_runtime'] = last if last else '-'
            info['spider_avg_runtime'] = avg_runtimes.get(spider.spider_name)
            spiders.append(info)
        return spiders
class JobInstance(Base):
    """ORM model for a scheduled crawl job."""
    __tablename__ = 'sk_job_instance'

    spider_name = db.Column(db.String(100), nullable=False, index=True)  # spider name
    project_id = db.Column(db.INTEGER, nullable=False, index=True)  # owning crawl-project id
    tags = db.Column(db.Text)  # job tags, comma separated
    spider_arguments = db.Column(db.Text)  # execution arguments, comma separated (ex.: arg1=foo,arg2=bar)
    priority = db.Column(db.INTEGER)  # job priority
    desc = db.Column(db.Text)  # job description
    cron_minutes = db.Column(db.String(20), default="0")       # cron minute field
    cron_hour = db.Column(db.String(20), default="*")          # cron hour field
    cron_day_of_month = db.Column(db.String(20), default="*")  # cron day-of-month field
    cron_day_of_week = db.Column(db.String(20), default="*")   # cron day-of-week field
    cron_month = db.Column(db.String(20), default="*")         # cron month field
    enabled = db.Column(db.INTEGER, default=0)  # 0 = periodic scheduling enabled, -1 = disabled
    run_type = db.Column(db.String(20))  # 'periodic' or 'onetime'

    def to_dict(self):
        """Serialize this job for API responses.

        Note: 'enabled' is exported as a boolean (stored 0 means enabled).
        """
        return {
            'job_instance_id': self.id,
            'project_id': self.project_id,
            'spider_name': self.spider_name,
            'tags': self.tags.split(',') if self.tags else None,
            'spider_arguments': self.spider_arguments,
            'priority': self.priority,
            'desc': self.desc,
            'cron_minutes': self.cron_minutes,
            'cron_hour': self.cron_hour,
            'cron_day_of_month': self.cron_day_of_month,
            'cron_day_of_week': self.cron_day_of_week,
            'cron_month': self.cron_month,
            'enabled': self.enabled == 0,
            'run_type': self.run_type,
        }

    @classmethod
    def list_job_instance_by_project_id(cls, project_id):
        """All jobs belonging to the given crawl project."""
        return cls.query.filter_by(project_id=project_id).all()

    @classmethod
    def find_job_instance_by_id(cls, job_instance_id):
        """Return the job with the given primary key, or None."""
        return cls.query.filter_by(id=job_instance_id).first()
class JobInstance(Base):
    """Scheduled crawl job identified by a random 16-character job_instance_id."""
    __tablename__ = 'sk_job_instance'

    spider_name = db.Column(db.String(100), nullable=False, index=True)
    project_id = db.Column(db.String(16), nullable=False, index=True)  # owning project's random id
    job_instance_id = db.Column(db.String(16), nullable=False, index=True)  # random 16-char id (not the PK)
    tags = db.Column(db.Text)  # job tag (split by , )
    spider_arguments = db.Column(db.Text)  # job execute arguments (split by , ex.: arg1=foo,arg2=bar)
    priority = db.Column(db.INTEGER)
    desc = db.Column(db.Text)
    cron_minutes = db.Column(db.String(20), default="0")
    cron_hour = db.Column(db.String(20), default="*")
    cron_day_of_month = db.Column(db.String(20), default="*")
    cron_day_of_week = db.Column(db.String(20), default="*")
    cron_month = db.Column(db.String(20), default="*")
    enabled = db.Column(db.INTEGER, default=0)  # 0 = enabled, -1 = disabled
    run_type = db.Column(db.String(20))  # periodic/onetime

    def __init__(self, **kwargs):
        """Create a job; generates job_instance_id if one was not supplied.

        Fix: the original ``__init__(self)`` accepted no keyword arguments,
        which broke SQLAlchemy's declarative constructor
        (``JobInstance(spider_name='x')`` raised TypeError). Forward kwargs
        to the base class, then fill in a generated id when none was given.
        """
        super().__init__(**kwargs)
        if not self.job_instance_id:
            self.job_instance_id = str(uuid.uuid4()).replace('-', '')[:16]

    def to_dict(self):
        """Serialize this job for API responses ('enabled' exported as bool)."""
        return dict(
            job_instance_id=self.job_instance_id,
            spider_name=self.spider_name,
            tags=self.tags.split(',') if self.tags else None,
            spider_arguments=self.spider_arguments,
            priority=self.priority,
            desc=self.desc,
            cron_minutes=self.cron_minutes,
            cron_hour=self.cron_hour,
            cron_day_of_month=self.cron_day_of_month,
            cron_day_of_week=self.cron_day_of_week,
            cron_month=self.cron_month,
            enabled=self.enabled == 0,
            run_type=self.run_type
        )

    @classmethod
    def list_job_instance_by_project_id(cls, project_id):
        """All jobs belonging to the given project (by random project id)."""
        return cls.query.filter_by(project_id=project_id).all()

    @classmethod
    def find_job_instance_by_id(cls, job_instance_id):
        """Return the job with the given random job_instance_id, or None."""
        return cls.query.filter_by(job_instance_id=job_instance_id).first()
class User(db.Model):
    """Application user with a hashed password and token-based auth."""
    __tablename__ = 'user'

    id = db.Column(db.Integer, primary_key=True)
    user_name = db.Column(db.String(20), unique=True)
    password_hash = db.Column(db.String(128))  # salted hash; plaintext is never stored
    confirmed = db.Column(db.Boolean, default=0)  # account-confirmed flag

    @property
    def password(self):
        """Write-only attribute: reading the plaintext password is forbidden."""
        raise AttributeError('密码是不可读属性')

    @password.setter
    def password(self, password):
        # Only the salted hash is persisted.
        self.password_hash = generate_password_hash(password)

    def verify_password(self, password):
        """Check a plaintext password against the stored hash."""
        return check_password_hash(self.password_hash, password)

    def generate_auth_token(self, expiration=18000):
        """Issue a signed token embedding this user's id (expires in *expiration* seconds)."""
        serializer = Serializer(app.config['SECRET_KEY'], expires_in=expiration)
        return serializer.dumps({'id': self.id})

    @staticmethod
    def verify_auth_token(token):
        """Return the user a token belongs to, or None when invalid or expired."""
        serializer = Serializer(app.config['SECRET_KEY'])
        try:
            payload = serializer.loads(token)
        except (SignatureExpired, BadSignature):
            # expired or tampered token — treat both as "not authenticated"
            return None
        return User.query.get(payload['id'])
class Project(Base):
    """Crawl-project ORM model with ownership and classification metadata."""
    __tablename__ = 'sk_project'

    project_name = db.Column(db.String(50), unique=True)
    applicant = db.Column(db.String(50))      # who requested the project
    developers = db.Column(db.String(50))     # developers working on it
    for_project = db.Column(db.String(50))    # the downstream project that raised the need
    project_cate = db.Column(db.String(100))  # crawler category
    project_alias = db.Column(db.String(100)) # human-readable remark/alias
    is_msd = db.Column(db.String(50))         # '0' = standalone crawler, '1' = master/slave distributed

    @classmethod
    def load_project(cls, project_list):
        """Insert every project from *project_list* whose name is not already stored."""
        for candidate in project_list:
            already_stored = cls.query.filter_by(
                project_name=candidate.project_name).first()
            if already_stored:
                continue
            db.session.add(candidate)
            db.session.commit()

    @classmethod
    def find_project_by_id(cls, project_id):
        """Return the project row with the given primary key, or None."""
        return cls.query.filter_by(id=project_id).first()

    def to_dict(self):
        """Serialize this project for API responses."""
        return {
            'project_id': self.id,
            'project_name': self.project_name,
            'applicant': self.applicant,
            'developers': self.developers,
            'for_project': self.for_project,
            'project_alias': self.project_alias,
            'project_cate': self.project_cate,
            'create_time': str(self.date_created),
            'is_msd': self.is_msd,
        }
class SpiderInstance(Base):
    """A spider registered under a crawl project (insert-only variant)."""
    __tablename__ = 'sk_spider'

    spider_name = db.Column(db.String(100))
    project_id = db.Column(db.INTEGER, nullable=False, index=True)

    @classmethod
    def update_spider_instances(cls, spider_instance_list):
        """Insert each spider from the list unless an identical row already exists."""
        for instance in spider_instance_list:
            duplicate = cls.query.filter_by(
                project_id=instance.project_id,
                spider_name=instance.spider_name).first()
            if duplicate:
                continue
            db.session.add(instance)
            db.session.commit()

    @classmethod
    def list_spider_by_project_id(cls, project_id):
        """All spiders belonging to the given project."""
        return cls.query.filter_by(project_id=project_id).all()

    def to_dict(self):
        """Serialize this spider for API responses."""
        return {
            'spider_instance_id': self.id,
            'spider_name': self.spider_name,
            'project_id': self.project_id,
        }
class JobExecution(Base):
    """ORM model for one execution of a scheduled job."""
    __tablename__ = 'sk_job_execution'

    project_id = db.Column(db.INTEGER, nullable=False, index=True)  # owning crawl-project id
    service_job_execution_id = db.Column(db.String(50), nullable=False, index=True)  # execution id on the crawl service
    job_instance_id = db.Column(db.INTEGER, nullable=False, index=True)  # the scheduled job this run belongs to
    create_time = db.Column(db.DATETIME)  # when this execution record was created
    start_time = db.Column(db.DATETIME)   # when the run started
    end_time = db.Column(db.DATETIME)     # when the run ended
    running_status = db.Column(db.INTEGER, default=SpiderStatus.PENDING)  # current run status
    running_on = db.Column(db.Text)  # host the run executes on

    def to_dict(self):
        """Serialize this execution together with its job instance."""
        job_instance = JobInstance.query.filter_by(id=self.job_instance_id).first()
        return {
            'project_id': self.project_id,
            'job_execution_id': self.id,
            'job_instance_id': self.job_instance_id,
            'service_job_execution_id': self.service_job_execution_id,
            'create_time': self.create_time.strftime('%Y-%m-%d %H:%M:%S') if self.create_time else None,
            'start_time': self.start_time.strftime('%Y-%m-%d %H:%M:%S') if self.start_time else None,
            'end_time': self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else None,
            'running_status': self.running_status,
            'running_on': self.running_on,
            'job_instance': job_instance.to_dict() if job_instance else {}
        }

    @classmethod
    def find_job_by_service_id(cls, service_job_execution_id):
        """Return the execution with the given service-side id, or None."""
        return cls.query.filter_by(service_job_execution_id=service_job_execution_id).first()

    @classmethod
    def list_job_by_service_ids(cls, service_job_execution_ids):
        """All executions whose service-side id is in the given list."""
        return cls.query.filter(cls.service_job_execution_id.in_(service_job_execution_ids)).all()

    @classmethod
    def list_uncomplete_job(cls):
        """Executions that are neither finished nor canceled."""
        return cls.query.filter(cls.running_status != SpiderStatus.FINISHED,
                                cls.running_status != SpiderStatus.CANCELED).all()

    @classmethod
    def list_jobs(cls, project_id, each_status_limit=100):
        """Return up to *each_status_limit* most recent executions per status bucket.

        :param project_id: crawl-project id
        :param each_status_limit: max rows per PENDING/RUNNING/COMPLETED bucket
        :return: dict with keys 'PENDING', 'RUNNING', 'COMPLETED'
        """
        result = {}
        result['PENDING'] = [job_execution.to_dict() for job_execution in
                             JobExecution.query.filter_by(project_id=project_id,
                                                          running_status=SpiderStatus.PENDING).order_by(
                                 desc(JobExecution.date_modified)).limit(each_status_limit)]
        result['RUNNING'] = [job_execution.to_dict() for job_execution in
                             JobExecution.query.filter_by(project_id=project_id,
                                                          running_status=SpiderStatus.RUNNING).order_by(
                                 desc(JobExecution.date_modified)).limit(each_status_limit)]
        # COMPLETED covers both finished and canceled runs.
        result['COMPLETED'] = [job_execution.to_dict() for job_execution in
                               JobExecution.query.filter(JobExecution.project_id == project_id).filter(
                                   (JobExecution.running_status == SpiderStatus.FINISHED) |
                                   (JobExecution.running_status == SpiderStatus.CANCELED)).order_by(
                                   desc(JobExecution.date_modified)).limit(each_status_limit)]
        return result

    @classmethod
    def list_run_stats_by_hours(cls, project_id):
        """Hourly execution counts for the last 24 hours (for charting).

        :param project_id: crawl-project id
        :return: list of {'key': 'YYYY-mm-dd HH:00:00', 'value': count}
        """
        result = {}
        hour_keys = []
        last_time = datetime.datetime.now() - datetime.timedelta(hours=23)
        last_time = datetime.datetime(last_time.year, last_time.month, last_time.day, last_time.hour)
        for hour in range(23, -1, -1):
            time_tmp = datetime.datetime.now() - datetime.timedelta(hours=hour)
            hour_key = time_tmp.strftime('%Y-%m-%d %H:00:00')
            hour_keys.append(hour_key)
            result[hour_key] = 0  # init every bucket so missing hours report 0
        for job_execution in JobExecution.query.filter(JobExecution.project_id == project_id,
                                                       JobExecution.date_created >= last_time).all():
            # Fix: rows are filtered on date_created but bucketed on
            # create_time; when create_time is NULL this raised
            # AttributeError, and when it fell outside the 24h window the
            # lookup raised KeyError. Skip such rows instead of crashing.
            if job_execution.create_time is None:
                continue
            hour_key = job_execution.create_time.strftime('%Y-%m-%d %H:00:00')
            if hour_key in result:
                result[hour_key] += 1
        return [dict(key=hour_key, value=result[hour_key]) for hour_key in hour_keys]
class DataCounts(Base):
    """Daily data-volume statistics reported for a crawl project."""
    __tablename__ = 'sk_data_count'

    project_name = db.Column(db.String(255))  # must match the English project name uploaded to the crawl platform
    developers = db.Column(db.String(255))    # responsible developers
    address = db.Column(db.String(255))       # storage address
    db_name = db.Column(db.String(255))       # database name
    table_name = db.Column(db.String(255))    # table name
    number = db.Column(db.String(255))        # total record count (stored as text)
    image_number = db.Column(db.String(255))
    video_number = db.Column(db.String(255))
    audio_number = db.Column(db.String(255))
    file_number = db.Column(db.String(255))
    image_size = db.Column(db.String(255))
    video_size = db.Column(db.String(255))
    audio_size = db.Column(db.String(255))
    file_size = db.Column(db.String(255))

    @classmethod
    def to_dict(cls, obj):
        """Serialize one row for API responses; media fields become 'count / size' strings."""
        return dict(projectName=obj.project_name,
                    date=str(obj.date_created).split()[0],  # keep only the date part
                    address=obj.address,
                    dbName=obj.db_name,
                    tableName=obj.table_name,
                    image="{} / {}".format(obj.image_number, obj.image_size),
                    video="{} / {}".format(obj.video_number, obj.video_size),
                    audio="{} / {}".format(obj.audio_number, obj.audio_size),
                    files="{} / {}".format(obj.file_number, obj.file_size),
                    total=obj.number)

    @classmethod
    def decimal2int(cls, data):
        """Split (label, value) rows into two parallel lists: (labels, values)."""
        data_list = []
        index_list = []
        for row in data:  # single pass: *data* may be a one-shot DB cursor
            data_list.append(row[1])
            index_list.append(row[0])
        return index_list, data_list

    @classmethod
    def array2dict_or_list(cls, data):
        """Convert rows to chart-friendly shapes.

        Rows with more than two columns become plain lists; 2-column rows
        become {'name': col0, 'value': col1} dicts.
        """
        data_list = []
        for row in data:
            if len(row) > 2:
                data_list.append(list(row))
            else:
                data_list.append({'value': row[1], 'name': row[0]})
        return data_list

    @classmethod
    def insert_record(cls, data_dict):
        """Insert one statistics row.

        :param data_dict: field-name -> value mapping
        :return: True on success, False on any database failure
        """
        try:
            record = cls(
                project_name=data_dict.get('project_name'),
                address=data_dict.get('address'),
                db_name=data_dict.get('db_name'),
                table_name=data_dict.get('table_name'),
                number=data_dict.get('number'),
                image_number=data_dict.get('image_number'),
                video_number=data_dict.get('video_number'),
                audio_number=data_dict.get('audio_number'),
                file_number=data_dict.get('file_number'),
                image_size=data_dict.get('image_size'),
                video_size=data_dict.get('video_size'),
                audio_size=data_dict.get('audio_size'),
                file_size=data_dict.get('file_size'),
            )
            db.session.add(record)
            db.session.commit()
            return True
        except Exception:
            # Fix: was a bare `except:` that swallowed everything (including
            # KeyboardInterrupt) and left the session in a failed state.
            # Roll back so later operations on the shared session still work.
            db.session.rollback()
            return False

    @classmethod
    def get_info(cls, manager_person, start_date, end_date, page_index, page_size):
        """Paginated statistics rows in [start_date, end_date], optionally filtered by developer.

        :param manager_person: developer name, or '所有人' ("everyone") for no filter
        """
        data = []
        if manager_person == "所有人":
            objs = cls.query.filter(
                func.date_format(cls.date_created, '%Y-%m-%d') >= start_date,
                func.date_format(cls.date_created, '%Y-%m-%d') <= end_date).paginate(
                page_index, page_size, False).items
        else:
            objs = cls.query.filter(
                func.date_format(cls.date_created, '%Y-%m-%d') >= start_date,
                func.date_format(cls.date_created, '%Y-%m-%d') <= end_date,
                cls.developers.contains(manager_person)).paginate(
                page_index, page_size, False).items
        for obj in objs:
            data.append(cls.to_dict(obj))
        return data

    @classmethod
    def get_data_num(cls, manager_person, start_date, end_date):
        """Row count for the same filter as :meth:`get_info` (for pagination)."""
        if manager_person == "所有人":
            return cls.query.filter(
                func.date_format(cls.date_created, '%Y-%m-%d') >= start_date,
                func.date_format(cls.date_created, '%Y-%m-%d') <= end_date).count()
        else:
            return cls.query.filter(
                func.date_format(cls.date_created, '%Y-%m-%d') >= start_date,
                func.date_format(cls.date_created, '%Y-%m-%d') <= end_date,
                cls.developers.contains(manager_person)).count()
class SpiderInstance(Base):
    """Spider ORM model; keeps the spider table in sync with the deployed project."""
    __tablename__ = 'sk_spider'
    spider_name = db.Column(db.String(100))
    project_id = db.Column(db.INTEGER, nullable=False, index=True)

    @classmethod
    def update_spider_instances(cls, project_id, spider_instance_list):
        """Sync the stored spiders of a project with *spider_instance_list*:
        insert missing ones, then delete rows no longer present in the list."""
        for spider_instance in spider_instance_list:
            existed_spider_instance = cls.query.filter_by(project_id=project_id,
                                                          spider_name=spider_instance.spider_name).first()
            if not existed_spider_instance:
                db.session.add(spider_instance)
                db.session.commit()
        # Remove spiders that exist in the DB but not in the submitted list.
        for spider in cls.query.filter_by(project_id=project_id).all():
            existed_spider = any(
                spider.spider_name == s.spider_name
                for s in spider_instance_list
            )
            if not existed_spider:
                db.session.delete(spider)
                db.session.commit()

    @classmethod
    def list_spider_by_project_id(cls, project_id):
        """All spiders belonging to the given project."""
        return cls.query.filter_by(project_id=project_id).all()

    def to_dict(self):
        """Serialize this spider for API responses."""
        return dict(spider_instance_id=self.id,
                    spider_name=self.spider_name,
                    project_id=self.project_id)

    @classmethod
    def list_spiders(cls, project_id):
        """List every spider of a project with its last and average runtime.

        NOTE(review): original TODO — unclear what exactly this groups on;
        distinguishing runs by spider_name alone? Verify against callers.
        """
        # Dialect-specific SQL: the sqlite branch relies on the lax
        # GROUP BY-without-aggregate behavior; the other branch (added for
        # postgres) uses row_number() over a window instead.
        url = app.config.get('SQLALCHEMY_DATABASE_URI')
        if url.startswith('sqlite'):
            # Latest job-execution creation time per spider (sqlite form).
            sql_last_runtime = '''
                select * from (select a.spider_name,b.date_created from sk_job_instance as a
                left join sk_job_execution as b on a.id = b.job_instance_id
                order by b.date_created desc) as c
                group by c.spider_name
            '''
        else:
            # Latest job-execution creation time per spider (window-function form).
            sql_last_runtime = '''
                select c.spider_name,c.date_created from
                (select a.spider_name,b.date_created,row_number() over(partition by a.spider_name order by b.date_created desc)rn
                from sk_job_instance as a
                left join sk_job_execution as b on a.id = b.job_instance_id )as c
                where c.rn=1;
            '''
        # Average execution duration per spider (finished jobs only).
        sql_avg_runtime = '''
            select a.spider_name,avg(end_time-start_time) from sk_job_instance as a
            left join sk_job_execution as b on a.id = b.job_instance_id
            where b.end_time is not null
            group by a.spider_name
        '''
        last_runtime_list = dict(
            (spider_name, last_run_time) for spider_name, last_run_time in db.engine.execute(sql_last_runtime))
        avg_runtime_list = dict(
            (spider_name, avg_run_time) for spider_name, avg_run_time in db.engine.execute(sql_avg_runtime))
        res = []
        for spider in cls.query.filter_by(project_id=project_id).all():
            last_runtime = last_runtime_list.get(spider.spider_name)
            res.append(dict(spider.to_dict(),
                            **{'spider_last_runtime': last_runtime if last_runtime else '-',
                               'spider_avg_runtime': avg_runtime_list.get(spider.spider_name)
                               }))
        return res
class SpiderInstance(Base):
    """Spider ORM model (master/slave variant with a slave spider name)."""
    __tablename__ = 'sk_spider'
    spider_name = db.Column(db.String(100))
    project_id = db.Column(db.INTEGER, nullable=False, index=True)
    spider_name_slave = db.Column(db.String(100))  # name of the slave-side spider, if any

    @classmethod
    def update_spider_instances(cls, project_id, spider_instance_list):
        """Sync the stored spiders of a project with *spider_instance_list*.

        :param project_id: crawl-project id
        :param spider_instance_list: submitted spider rows for that project
        """
        # Insert any (project_id, spider_name) pair not already stored.
        for spider_instance in spider_instance_list:
            existed_spider_instance = cls.query.filter_by(project_id=project_id,
                                                          spider_name=spider_instance.spider_name).first()
            if not existed_spider_instance:
                db.session.add(spider_instance)
                db.session.commit()
        # Delete stored spiders that are absent from the submitted list.
        for spider in cls.query.filter_by(project_id=project_id).all():
            existed_spider = any(
                spider.spider_name == s.spider_name
                for s in spider_instance_list
            )
            if not existed_spider:
                db.session.delete(spider)
                db.session.commit()

    @classmethod
    def list_spider_by_project_id(cls, project_id):
        """All spiders of the given project in the sk_spider table.

        :param project_id: crawl-project id
        :return: list of SpiderInstance rows
        """
        return cls.query.filter_by(project_id=project_id).all()

    def to_dict(self):
        """Serialize this spider (including the slave name) for API responses."""
        return dict(spider_instance_id=self.id,
                    spider_name=self.spider_name,
                    spider_name_slave=self.spider_name_slave,
                    project_id=self.project_id)

    @classmethod
    def list_spiders(cls, project_id):
        """List every spider of a project with its latest job creation time
        and average runtime.

        :param project_id: crawl-project id
        :return: list of spider dicts with 'spider_last_runtime' / 'spider_avg_runtime'
        """
        # Latest job creation time per spider:
        # yields [(spider_name, latest_created), ...]
        sql_last_runtime = '''
            select * from (select a.spider_name,b.date_created from sk_job_instance as a
            left join sk_job_execution as b on a.id = b.job_instance_id
            order by b.date_created desc) as c
            group by c.spider_name
        '''
        # Average runtime per spider (finished jobs only):
        # yields [(spider_name, avg_runtime), ...]
        # NOTE(review): per the original comment, running this SQL by hand
        # reportedly returns 0 for every average — needs further investigation.
        sql_avg_runtime = '''
            select a.spider_name,avg(end_time-start_time) from sk_job_instance as a
            left join sk_job_execution as b on a.id = b.job_instance_id
            where b.end_time is not null
            group by a.spider_name
        '''
        last_runtime_list = dict(
            (spider_name, last_run_time) for spider_name, last_run_time in db.engine.execute(sql_last_runtime))
        avg_runtime_list = dict(
            (spider_name, avg_run_time) for spider_name, avg_run_time in db.engine.execute(sql_avg_runtime))
        res = []
        for spider in cls.query.filter_by(project_id=project_id).all():
            last_runtime = last_runtime_list.get(spider.spider_name)
            res.append(dict(spider.to_dict(),
                            **{'spider_last_runtime': last_runtime if last_runtime else '-',
                               'spider_avg_runtime': avg_runtime_list.get(spider.spider_name)
                               }))
        return res
class DeveloperProject(Base):
    """Developer tag (unique name) — presumably used to label projects by developer; confirm against TagProjectShip usage."""
    name = db.Column(db.String(50), unique=True)
class IndustryProject(Base):
    """Industry tag (unique name)."""
    name = db.Column(db.String(50), unique=True)
class JobInstance(Base):
    """Crawl-task table: one scheduled video-collection job."""
    __tablename__ = 'job_instance'
    job_name = db.Column(db.String(50))  # task name
    spider_type = db.Column(db.String(50))  # collection mode
    keywords = db.Column(db.String(50))  # search keywords
    project_id = db.Column(db.INTEGER, nullable=False, index=True)  # project id; usable to look up the target site (projects may be named after it)
    spider_name = db.Column(db.String(100), nullable=False, index=True)  # collection mode (keyword crawl / board crawl)
    run_time = db.Column(db.String(20))  # 'long-term' or a fixed date range
    start_date = db.Column(db.Date, default=db.func.current_timestamp())  # task start date
    end_date = db.Column(db.Date, default=db.func.current_timestamp())  # task end date
    tags = db.Column(db.Text)  # job tag (split by , )
    spider_freq = db.Column(db.Float, default=0)  # crawl frequency in days; decomposed/mapped into the cron fields below
    run_type = db.Column(db.String(20))  # periodic/onetime
    upload_time_type = db.Column(db.String(20))  # how the video upload-time window is specified
    upload_time_start_date = db.Column(
        db.Date, default=db.func.current_timestamp())  # upload-time window start
    upload_time_end_date = db.Column(
        db.Date, default=db.func.current_timestamp())  # upload-time window end
    video_time_short = db.Column(db.Integer)  # minimum video duration
    video_time_long = db.Column(db.Integer)  # maximum video duration
    spider_arguments = db.Column(
        db.Text)  # job execute arguments (split by , ex.: arg1=foo,arg2=bar)
    priority = db.Column(db.INTEGER)  # priority
    cron_minutes = db.Column(db.String(20), default="0")
    cron_hour = db.Column(db.String(20), default="*")
    cron_day_of_month = db.Column(db.String(20), default="*")
    cron_day_of_week = db.Column(db.String(20), default="*")
    cron_month = db.Column(db.String(20), default="*")
    enabled = db.Column(db.INTEGER, default=0)  # 0/-1/1 — task state
    user_id = db.Column(db.INTEGER)  # id of the user who created the task
    pri = db.Column(db.String(20))  # urgency: urgent / routine

    def to_dict(self):
        """Serialize this task for API responses ('enabled' exported as bool: stored 0 means enabled)."""
        return {
            'id': self.id,
            'date_created': self.date_created.strftime('%Y-%m-%d') if self.date_created else None,
            'job_instance_id': self.id,
            'job_name': self.job_name,
            'keywords': self.keywords,
            # spider_type=self.spider_type,
            "project_id": self.project_id,
            'spider_name': self.spider_name,
            'run_time': self.run_time,
            'start_date': self.start_date.strftime('%Y-%m-%d') if self.start_date else None,
            'end_date': self.end_date.strftime('%Y-%m-%d') if self.end_date else None,
            'tags': self.tags.split(',') if self.tags else None,
            'spider_freq': self.spider_freq,
            'run_type': self.run_type,
            'upload_time_type': self.upload_time_type,
            'upload_time_start_date':
                self.upload_time_start_date.strftime('%Y-%m-%d') if self.upload_time_start_date else None,
            'upload_time_end_date':
                self.upload_time_end_date.strftime('%Y-%m-%d') if self.upload_time_end_date else None,
            'spider_arguments': self.spider_arguments,
            'video_time_short': self.video_time_short,
            'video_time_long': self.video_time_long,
            'priority': self.priority,
            # desc=self.desc,
            'cron_minutes': self.cron_minutes,
            'cron_hour': self.cron_hour,
            'cron_day_of_month': self.cron_day_of_month,
            'cron_day_of_week': self.cron_day_of_week,
            'cron_month': self.cron_month,
            'enabled': self.enabled == 0,
            'user_id': self.user_id
        }

    @classmethod
    def list_job_instance_by_project_id(cls, project_id):
        """All tasks belonging to the given project."""
        return cls.query.filter_by(project_id=project_id).all()

    @classmethod
    def find_job_instance_by_id(cls, job_instance_id):
        """Return the task with the given primary key, or None."""
        return cls.query.filter_by(id=job_instance_id).first()
class ThemeProject(Base):
    """Theme tag (unique name)."""
    name = db.Column(db.String(50), unique=True)
class CiteProject(Base):
    """Requirement-source tag (unique name)."""
    name = db.Column(db.String(50), unique=True)
class RunningJob(Base):
    """A currently-running spider job, tracked by its random id — presumably matched against scheduler state; confirm against caller."""
    __tablename__ = 'running_job'
    spider_random_id = db.Column(db.String(50), nullable=False, index=True)  # random id assigned to the running spider
class JobExecution(Base):
    """ORM model for one execution of a scheduled job."""
    __tablename__ = 'sk_job_execution'

    project_id = db.Column(db.INTEGER, nullable=False, index=True)  # owning crawl-project id
    service_job_execution_id = db.Column(db.String(255), nullable=False, index=True)  # execution-history id on the crawl service
    job_instance_id = db.Column(db.INTEGER, nullable=False, index=True)  # the scheduled job this run belongs to
    create_time = db.Column(db.DATETIME)  # when this history record was created
    start_time = db.Column(db.DATETIME)   # when the run started
    end_time = db.Column(db.DATETIME)     # when the run ended
    running_status = db.Column(db.INTEGER, default=SpiderStatus.PENDING)  # current run status
    running_on = db.Column(db.Text)  # executing host, e.g. 'localhost:6800'

    def to_dict(self):
        """Serialize this execution together with its job instance.

        :return: dict with execution fields plus the nested job instance
        """
        job_instance = JobInstance.query.filter_by(id=self.job_instance_id).first()
        return {
            'project_id': self.project_id,
            'job_execution_id': self.id,
            'job_instance_id': self.job_instance_id,
            'service_job_execution_id': self.service_job_execution_id,
            'create_time': self.create_time.strftime('%Y-%m-%d %H:%M:%S') if self.create_time else None,
            'start_time': self.start_time.strftime('%Y-%m-%d %H:%M:%S') if self.start_time else None,
            'end_time': self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else None,
            'running_status': self.running_status,
            'running_on': self.running_on,
            'job_instance': job_instance.to_dict() if job_instance else {}
        }

    @classmethod
    def find_job_by_service_id(cls, service_job_execution_id):
        """Return the execution with the given service-side id, or None."""
        return cls.query.filter_by(service_job_execution_id=service_job_execution_id).first()

    @classmethod
    def list_job_by_service_ids(cls, service_job_execution_ids):
        """All executions whose service-side id is in the given list."""
        return cls.query.filter(cls.service_job_execution_id.in_(service_job_execution_ids)).all()

    @classmethod
    def list_uncomplete_job(cls):
        """Executions that are neither finished nor canceled."""
        return cls.query.filter(cls.running_status != SpiderStatus.FINISHED,
                                cls.running_status != SpiderStatus.CANCELED).all()

    @classmethod
    def list_jobs(cls, project_id, each_status_limit=100):
        """Return up to *each_status_limit* most recent executions per status bucket.

        :param project_id: crawl-project id
        :param each_status_limit: max rows per status bucket, default 100
        :return: dict with keys 'PENDING', 'RUNNING', 'COMPLETED'
        """
        result = {}
        result['PENDING'] = [job_execution.to_dict() for job_execution in
                             JobExecution.query.filter_by(project_id=project_id,
                                                          running_status=SpiderStatus.PENDING).order_by(
                                 desc(JobExecution.date_modified)).limit(each_status_limit)]
        result['RUNNING'] = [job_execution.to_dict() for job_execution in
                             JobExecution.query.filter_by(project_id=project_id,
                                                          running_status=SpiderStatus.RUNNING).order_by(
                                 desc(JobExecution.date_modified)).limit(each_status_limit)]
        # COMPLETED covers both finished and canceled runs.
        result['COMPLETED'] = [job_execution.to_dict() for job_execution in
                               JobExecution.query.filter(JobExecution.project_id == project_id).filter(
                                   (JobExecution.running_status == SpiderStatus.FINISHED) |
                                   (JobExecution.running_status == SpiderStatus.CANCELED)).order_by(
                                   desc(JobExecution.date_modified)).limit(each_status_limit)]
        return result

    @classmethod
    def list_run_stats_by_hours(cls, project_id):
        """Hourly execution counts over the last 24 hours, for front-end charts.

        :param project_id: crawl-project id
        :return: list of {'key': 'YYYY-mm-dd HH:00:00', 'value': count}
        """
        result = {}
        hour_keys = []
        last_time = datetime.datetime.now() - datetime.timedelta(hours=23)
        last_time = datetime.datetime(last_time.year, last_time.month, last_time.day, last_time.hour)
        for hour in range(23, -1, -1):
            time_tmp = datetime.datetime.now() - datetime.timedelta(hours=hour)
            hour_key = time_tmp.strftime('%Y-%m-%d %H:00:00')
            hour_keys.append(hour_key)
            result[hour_key] = 0  # init every bucket so missing hours report 0
        for job_execution in JobExecution.query.filter(JobExecution.project_id == project_id,
                                                       JobExecution.date_created >= last_time).all():
            # Fix: rows are filtered on date_created but bucketed on
            # create_time; when create_time is NULL this raised
            # AttributeError, and when it fell outside the 24h window the
            # lookup raised KeyError. Skip such rows instead of crashing.
            if job_execution.create_time is None:
                continue
            hour_key = job_execution.create_time.strftime('%Y-%m-%d %H:00:00')
            if hour_key in result:
                result[hour_key] += 1
        return [dict(key=hour_key, value=result[hour_key]) for hour_key in hour_keys]