def __init__(self):
    """Prepare template paths, the target project name and a MySQL
    connection, then cache the database's table list."""
    self.path_tpl_entity = PathUtil.root('templates') + '/entity.tpl'
    self.path_tpl_mapper = PathUtil.root('templates') + '/mapper.tpl'
    self.project_java = 'test'
    db_config = {
        'ip': 'localhost',
        'user': '******',
        'passwd': '123456',
        'db': 'tacomall',
    }
    self.mysql_util = MysqlUtil(db_config)
    self.all_tables = []
    # Populate self.all_tables up front.
    self._query_all_tables()
def __init__(self, root_path):
    """Hold the checker's database handle, logger and the SQL templates
    it uses to verify downloaded works.

    :param root_path: base directory the checker inspects files under
    """
    self.root_path = root_path
    self.db_util = MysqlUtil()
    self.logger = Log(__name__, log_cate='checker').get_log()
    self.before_illuster_id = None
    # Lookup / maintenance statements over the illust table.
    self.SELECT_SQL = "SELECT page_no, loc_url,status FROM illust WHERE illust_id={}"
    self.RESET_ILLUSTER_SQL = "UPDATE illust SET status = 1 WHERE illuster_id = {}"
    self.RESET_ILLUST_SQL = "UPDATE illust SET status = 1 WHERE illuster_id = {} AND illust_id NOT IN ({})"
    self.FILTER_SQL = "SELECT illust_id FROM illust WHERE status = 10 AND illust_id in ({})"
    # priority = 6 marks an illustrator as fully checked.
    self.MAKE_ILLUSTER_STATUS_DONE_SQL = "UPDATE illuster SET priority = 6 WHERE illuster_id = {}"
    self.CHECK_IF_DONE = 'SELECT priority FROM illuster WHERE illuster_id={}'
    self.GET_DONE_ILLUSTER = 'SELECT illuster_id FROM illuster WHERE priority=6'
class CommentDAO:
    """Data-access layer for article comments."""

    def __init__(self):
        self.session = MysqlUtil().get_session()

    def new_comment(self, art_id, content):
        """Store *content* as a comment by the current user on article
        *art_id*.  Returns an (ok, message) pair."""
        if ArticleDao().get_article_by_id(art_id) is None:
            return False, u'评论的文章不存在'
        comment = Comment()
        comment.article_id = art_id
        comment.comment = content
        comment.user = current_user.username
        self.session.add(comment)
        self.session.commit()
        return True, u'评论成功!'

    def get_article_comments_num(self, art_id):
        """Count the comments attached to article *art_id*."""
        query = self.session.query(Comment).filter_by(article_id=art_id)
        return query.count()

    def get_comments_by_article(self, article_id, page=0, size=20):
        """Return one page of comments (newest first) and a flag:
        u'more' when further pages remain, u'nomore' otherwise.
        Returns (None, message) when *page* is past the end."""
        total = self.get_article_comments_num(article_id)
        if page * size > total:
            return None, u'分页数超出最大值'
        info = u'more' if (page + 1) * size < total else u'nomore'
        page_query = (self.session.query(Comment)
                      .filter_by(article_id=article_id)
                      .order_by(desc(Comment.modified_time))
                      .offset(size * page)
                      .limit(size))
        return page_query.all(), info
def __init__(self):
    # Resets an artist back to "not yet crawled"; filled in with
    # str.format (modify_time, illuster_id).
    self.sql_4_update = 'update illuster set priority = 0, modify_time = "{}" WHERE illuster_id = {}'
    # Logger must exist before load_config(), which logs on failure and
    # sets self.thread_num used by the pool below.
    self.load_logger()
    self.load_config()
    self.pixiv_util = PixivUtil()
    self.db_util = DbUtil()
    self.pool = threadpool.ThreadPool(self.thread_num)
    # Next artists to crawl, highest priority first (illuster 11 excluded).
    self.WAITING_SQL = 'select illuster_id from illuster WHERE priority > 0 ' \
        ' AND illuster_id!=11 ORDER BY priority DESC LIMIT {} '
    # Accept unverified TLS certificates and silence the resulting warnings.
    ssl._create_default_https_context = ssl._create_unverified_context
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    # Workaround for ('ssl3_read_bytes', 'sslv3 alert bad record mac').
    requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'
def __init__(self):
    """Build the pixiv API endpoints, an authenticated requests session
    with retry/proxy support, and the SQL used to record illusts."""
    # Compiled regexes and config must exist before anything else runs.
    self.init_pattern()
    self.load_config()
    self.logger = Log(__name__).get_log()
    self.GET_KEY_PAGE = "https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index"
    self.LOGIN_PAGE = "https://accounts.pixiv.net/api/login?lang=zh"  # login endpoint
    self.UGOIRA_URL = 'https://www.pixiv.net/ajax/illust/{illust_id}/ugoira_meta'  # ugoira (animated work) metadata
    self.URL_4_GET_ALL_WORK_ID = "https://www.pixiv.net/ajax/user/{userId}/profile/all"  # every work of an artist
    self.URL_4_ILLUSTER_MAIN_PAGE = "https://www.pixiv.net/ajax/user/{}/profile/top"  # artist top page (24 works)
    self.URL_4_GET_ILLUSTER_INFO = "https://www.pixiv.net/ajax/user/{}?full=1"  # artist profile
    # FIX: removed a duplicate assignment of URL_ILLUST_PAGE (the legacy
    # member_illust.php URL) that was immediately overwritten below.
    self.URL_ILLUST_PAGE = "https://www.pixiv.net/ajax/illust/{illust_id}"  # single illust page
    self.REFERER = "https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index"
    self.USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    self.headers = {'Referer': self.REFERER, 'User-Agent': self.USER_AGENT}
    # Alternative retry mechanism:
    # requests.adapters.DEFAULT_RETRIES = self.max_retries
    self.session = requests.Session()
    if self.cookie is not None:
        self.session.cookies.set("authentication", self.cookie)
    if self.proxies['http'] is not None or self.proxies['https'] is not None:
        self.session.proxies = self.proxies  # session-wide default proxies
    # Retry failed requests transparently at the transport level.
    request_retry = requests.adapters.HTTPAdapter(
        max_retries=self.max_retries)
    self.session.mount('https://', request_retry)
    self.session.mount('http://', request_retry)
    self.spider_util = SpiderUtil()
    # Database access.
    self.db_util = DbUtil()
    self.sql_4_insert = ('insert into illust(title,url,illust_id,illuster_id,page_no,status,`restrict`,x_restrict)'
                         'values ( %s, %s, %s, %s, %s, 0, %s, %s)')
    self.sql_4_insert_4_ugoira = ('insert into illust(title,url,illust_id,illuster_id,page_no,status)'
                                  'values( %s, %s, %s, %s, %s,444)')
    self.sql_4_insert_2_done = ('insert into illust(title,url,illust_id,illuster_id,page_no,status)'
                                'values( %s, %s, %s, %s, 0, 10)')
def init_db():
    """Create the application database (if missing), build the ORM
    schema, and seed the admin account from configuration."""
    cfg = ConfigUtil()
    db_host = cfg.get_config(conf='host', section='db_info')
    db_port = cfg.get_config(conf='port', section='db_info')
    db_user = cfg.get_config(conf='username', section='db_info')
    db_pass = cfg.get_config(conf='password', section='db_info')
    db_name = cfg.get_config(conf='database', section='db_info')
    connection = MySQLdb.connect(host=db_host,
                                 user=db_user,
                                 passwd=db_pass,
                                 port=int(db_port))
    cursor = connection.cursor()
    cursor.execute(
        'CREATE DATABASE IF NOT EXISTS `{db}` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;'
        .format(db=db_name))
    connection.commit()
    cursor.close()
    connection.close()
    # Build the ORM tables and ensure the admin user exists.
    session = MysqlUtil().get_session()
    BaseModel.metadata.create_all(MysqlUtil().get_engine())
    admin_email = cfg.get_config('email', 'admin_account')
    if session.query(User).filter_by(email=admin_email).first() is None:
        admin_pass = cfg.get_config('password', 'admin_account')
        admin_name = cfg.get_config('username', 'admin_account')
        admin = User(username=admin_name,
                     password=generate_password_hash(admin_pass),
                     email=admin_email,
                     status=1,
                     is_admin=1,
                     register_time=datetime.datetime.now(),
                     last_login_time=datetime.datetime.now())
        session.add(admin)
        session.commit()
def __init__(self):
    """Open a SQLAlchemy session via the shared MySQL helper."""
    self.session = MysqlUtil().get_session()
class PixivUtil:
    """Thin client for the pixiv web/AJAX API.

    Logs in with a username/password (or cookie), looks up artist
    ("illuster") and work ("illust") metadata, downloads still images
    and ugoira (animated) works, and records download state in the
    project database via DbUtil.
    """

    def __init__(self):
        # Compiled regexes and config must exist before anything else runs.
        self.init_pattern()
        self.load_config()
        self.logger = Log(__name__).get_log()
        self.GET_KEY_PAGE = "https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index"
        self.LOGIN_PAGE = "https://accounts.pixiv.net/api/login?lang=zh"  # login endpoint
        self.UGOIRA_URL = 'https://www.pixiv.net/ajax/illust/{illust_id}/ugoira_meta'  # ugoira (animated work) metadata
        self.URL_4_GET_ALL_WORK_ID = "https://www.pixiv.net/ajax/user/{userId}/profile/all"  # every work of an artist
        self.URL_4_ILLUSTER_MAIN_PAGE = "https://www.pixiv.net/ajax/user/{}/profile/top"  # artist top page (24 works)
        self.URL_4_GET_ILLUSTER_INFO = "https://www.pixiv.net/ajax/user/{}?full=1"  # artist profile
        # NOTE(review): the next assignment is dead code — it is
        # immediately overwritten by the ajax URL on the line after it.
        self.URL_ILLUST_PAGE = "https://www.pixiv.net/member_illust.php?mode=medium&illust_id={illust_id}"
        self.URL_ILLUST_PAGE = "https://www.pixiv.net/ajax/illust/{illust_id}"  # single illust page
        self.REFERER = "https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index"
        self.USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
        self.headers = {'Referer': self.REFERER, 'User-Agent': self.USER_AGENT}
        # Alternative retry mechanism:
        # requests.adapters.DEFAULT_RETRIES = self.max_retries
        self.session = requests.Session()
        if self.cookie is not None:
            self.session.cookies.set("authentication", self.cookie)
        if self.proxies['http'] is not None or self.proxies[
                'https'] is not None:
            self.session.proxies = self.proxies  # session-wide default proxies
        # Retry failed requests transparently at the transport level.
        request_retry = requests.adapters.HTTPAdapter(
            max_retries=self.max_retries)
        self.session.mount('https://', request_retry)
        self.session.mount('http://', request_retry)
        # self.session.keep_alive = False
        self.spider_util = SpiderUtil()
        # Database access.
        self.db_util = DbUtil()
        self.sql_4_insert = 'insert into illust(title,url,illust_id,illuster_id,page_no,status,`restrict`,x_restrict)' \
            'values ( %s, %s, %s, %s, %s, 0, %s, %s)'
        self.sql_4_insert_4_ugoira = 'insert into illust(title,url,illust_id,illuster_id,page_no,status)values( %s, %s, %s, %s, %s,444)'
        self.sql_4_insert_2_done = 'insert into illust(title,url,illust_id,illuster_id,page_no,status)values( %s, %s, %s, %s, 0, 10)'

    def get_session(self):
        """Log in and return the authenticated requests session.

        NOTE(review): on failure this returns the string 'error'
        rather than None — callers must compare against that value.
        """
        if self.login():
            return self.session
        else:
            self.logger.error("登陆失败,请检查账号密码是否正确和网络是否连通")
            return 'error'

    def load_config(self):
        """Read proxies, retry/timeout limits and account credentials
        from config/config.conf into instance attributes."""
        config = ConfigureUtil('config/config.conf')
        http_proxy = config.get('proxy', 'http', is_error=True)
        https_proxy = config.get('proxy', 'https', is_error=True)
        self.proxies = {'http': http_proxy, 'https': https_proxy}
        # Retry count must be numeric.
        self.max_retries = config.get('app', 'max_retries', 'int')
        # Request timeout in seconds.
        self.timeout = config.get('app',
                                  'time_out',
                                  type_="int",
                                  is_error=True,
                                  default=20)
        self.username = config.get('account', 'username')
        self.password = config.get('account', 'password')
        # (A hard-coded sample cookie that used to be commented out here
        # has been removed; the cookie now always comes from config.)
        self.cookie = config.get('account', 'cookie', is_error=True)

    def init_pattern(self):
        """Compile the regexes used to scrape the bookmark HTML pages."""
        # id="..." checkboxes on the followed-artists page -> artist ids.
        self.illuster_id_from_user = re.compile(
            '<input name="id\[\]" value="(.*?)" type="checkbox"')
        self.illuster_profile_from_user = re.compile(
            'data-profile_img="(.*?)"')
        self.illuster_username_from_user = re.compile(
            '"data-user_name="(.*?)"></a>')
        # Pagination links of the public/private bookmark lists.
        self.pagenum_4_show = re.compile(
            '<a href="\?type=user&rest=show&p=(.*?)">')
        self.pagenum_4_hide = re.compile(
            '<a href="\?type=user&rest=hide&p=(.*?)">')
        # Extracts the embedded JSON with work details.
        self.get_illust_detail = re.compile(
            '}\)\((.*?)\);</script><link rel="apple-touch-icon"')

    def set_logger(self, logger):
        """Replace the instance logger (e.g. with a caller-owned one)."""
        self.logger = logger

    def download_work_by_illust_id(self,
                                   save_dir,
                                   illust_id: str,
                                   use_databse=False):
        """Fetch one work's metadata and dispatch to the right downloader.

        :param save_dir: directory to save files into
        :param illust_id: pixiv work id (string)
        :param use_databse: when True, skip works already marked done in
            the DB (parameter name typo is preserved for compatibility)
        """
        if use_databse:
            res = self.db_util.get_one(
                "select status from illust where illust_id = " + illust_id)
            if res is not None:
                if res[0] == WorkStatus.done.value:  # already downloaded
                    self.logger.info(" 本插画已被下载 illust_id 是 " + illust_id)
                    return True
        # Fetch the work's metadata.
        illust_info = self.get_img_info_by_img_id(illust_id)
        if illust_info is None:
            return
        (title, img_url, page_count, restrict, x_restrict, illust_type,
         illuster_id) = illust_info
        # Dispatch on the work type.
        if illust_type == IllustType.illust.value:
            self.download_illust(save_dir, illust_id, illuster_id, title,
                                 img_url, page_count, restrict, x_restrict)
        elif illust_type == IllustType.ugoira.value:
            # Animated work.
            self.logger.warning("遇到ugoira插画,id为{}".format(illust_id))
            # return
            self.download_ugoira(save_dir, illust_id, illuster_id, title,
                                 img_url, restrict, x_restrict)
        elif illust_type == IllustType.manga.value:
            self.download_manga(illust_id, illuster_id, title, img_url,
                                page_count, restrict, x_restrict)
        else:
            self.logger.info("暂不支持该类型的下载")

    def download_illust(self,
                        save_dir,
                        illust_id,
                        illuster_id,
                        title,
                        img_url,
                        page_count,
                        restrict,
                        x_restrict,
                        use_database=False):
        """Download every page of a still-image work; optionally record
        the outcome (done/failure) in the database."""
        res = True
        for i in range(page_count):
            try:
                # Page i's URL: replace the _p0 suffix with _p<i>.
                temp = img_url.replace('_p0', '_p' + str(i))
                res = res and self.spider_util.download_img(
                    temp, save_dir, header=self.headers)
            except Exception as e:
                res = False
                self.logger.error("下载失败" + repr(e) + "url是" + temp,
                                  exc_info=True)
                break
        if use_database:
            if res:
                self.insert_illust(illust_id, title, img_url, illuster_id,
                                   page_count, restrict, x_restrict,
                                   WorkStatus.done.value)
            else:
                # Record the failed download so it can be retried later.
                self.insert_illust(illust_id, title, img_url, illuster_id,
                                   page_count, restrict, x_restrict,
                                   WorkStatus.failure.value)

    def download_illust_o(self, save_dir, illust_id, illuster_id, title,
                          img_url, page_count, restrict, x_restrict):
        """Older variant of download_illust that always writes to the DB.

        NOTE(review): insert_illust is called with
        IllustType.illust.value in the *title* position — the positional
        arguments do not match insert_illust's signature; verify before
        relying on this path.  Also note img_url is rebound inside the
        loop, so the '_p0' replacement only works on the first page.
        """
        res = True
        for i in range(page_count):
            try:
                img_url = img_url.replace('_p0', '_p' + str(i))
                self.insert_illust(illust_id, IllustType.illust.value, title,
                                   img_url, illuster_id, page_count, restrict,
                                   x_restrict)
                res = res and self.spider_util.download_img(
                    img_url, save_dir, header=self.headers)
            except Exception as e:
                res = False
                self.logger.error("下载失败" + repr(e) + "url是" + img_url,
                                  exc_info=True)
                break
        if res:
            self.db_util.update(
                'update illust set status = %s where illust_id = %s',
                (WorkStatus.done.value, illust_id))

    def filter_4_downloaded_work(self, illust_id_list):
        """Remove ids already downloaded (status == done in the DB) from
        *illust_id_list* (mutated in place) and return it."""
        str_illust_id = ",".join(
            str(illust_id) for illust_id in illust_id_list)
        illust_sql = 'select illust_id from illust WHERE status = {} AND illust_id in ({})' \
            .format(WorkStatus.done.value, str_illust_id)
        res_in_illust = self.db_util.get_all(illust_sql)
        downloaded_illust_ids = [i[0] for i in res_in_illust]
        for illust_id in downloaded_illust_ids:
            if str(illust_id) in illust_id_list:
                illust_id_list.remove(str(illust_id))
        return illust_id_list

    def get_postkey(self, url):
        """Scrape the post_key hidden field from the login page at *url*.

        Must use GET (a POST does not return the key).  Returns the key
        string, or None when it cannot be found."""
        pat = 'name="post_key" value="(.*?)"'
        content = self.session.get(url, timeout=15).text
        res_temp = re.findall(pat, content)
        if len(res_temp) != 1:
            self.logger.error("无法获得postKey", exc_info=True)
            return None
        postkey = re.findall(pat, content)[0]
        self.logger.info("your post key is " + postkey)
        return postkey

    def login(self):
        """Simulate the pixiv login flow.

        NOTE(review): returns self.session on success and False on
        failure — truthiness works, but the return type is inconsistent.
        """
        try:
            postkey = self.get_postkey(self.GET_KEY_PAGE)
            if not postkey:
                return False
            post_data = \
                {
                    "pixiv_id": self.username,
                    "password": self.password,
                    # "captcha": "",
                    # "g_recaptcha_response": "",
                    "post_key": postkey,
                    # "source": "pc",
                    "ref": "wwwtop_accounts_index",
                    "return_to": "https://www.pixiv.net/"
                }
            # Sending our headers avoids some anti-bot responses.
            result = self.session.post(self.LOGIN_PAGE,
                                       data=post_data,
                                       headers=self.headers,
                                       cookies={"cookies": self.cookie})
            pat = '"body":{"(.*?)"'
            is_login_flag = re.findall(pat, result.text)[0]
            if is_login_flag == 'success':
                self.logger.info("Log in successfully.Your username is " +
                                 self.username)
                return self.session
            else:
                self.logger.info("Login failed")
                return False
        except Exception as e:
            self.logger.error('连接无响应', exc_info=True)
            return False

    def get_illust_ids(self, illuster_id):
        """Return the list of work ids (stills and ugoira) for an artist.

        :param illuster_id: artist id
        :return: list of id strings, [] when the artist has no works,
            or None on error
        """
        try:
            illust_id_json = self.get(
                self.URL_4_GET_ALL_WORK_ID.format(userId=illuster_id)).text
            illust_id_json = str2json(illust_id_json)
            # body.illusts maps illust_id -> illust_info.
            ill_ids = illust_id_json["body"]["illusts"]
            if ill_ids != []:
                return list(ill_ids.keys())
            else:
                return []
        except Exception as e:
            self.logger.error("搜索画师失败!" + repr(e) + "illuster_id是" +
                              illuster_id,
                              exc_info=True)
            return None

    def get_manga_ids(self, illuster_id):
        """Return the list of manga ids for an artist."""
        illust_id_json = self.get(
            self.URL_4_GET_ALL_WORK_ID.format(userId=illuster_id)).text
        illust_id_json = str2json(illust_id_json)
        manga_ids = illust_id_json["body"]["manga"]
        return list(manga_ids.keys())

    def get_img_info_by_img_id(self, illust_id):
        """Fetch detailed metadata for one work.

        :return: (title, url, page_count, restrict, x_restrict,
            illust_type, illuster_id), or None on error
        """
        try:
            info = self.get(
                self.URL_ILLUST_PAGE.format(illust_id=illust_id)).text
            illust_info_json = str2json(info)
            if self.isError(illust_info_json):
                self.logger.error(
                    "找不到illust信息,illust_id is {},error message is {}".format(
                        illust_id, illust_info_json['message']),
                    exc_info=True)
                return None
            body = illust_info_json['body']
            illust_type = body['illustType']
            page_count = body['pageCount']
            restrict = body['restrict']
            x_restrict = body['xRestrict']
            title = body['title']
            url = body['urls']['original']
            illuster_id = body['userId']
            return title, url, page_count, restrict, x_restrict, illust_type, illuster_id
        except Exception as e:
            self.logger.error("获取插画信息失败" + repr(e) + "illust_id是" + illust_id,
                              exc_info=True)
            return None

    def get_concerned_illuster_info(self, type_, current_page_num):
        """Scrape one page of the user's followed-artists list.

        :param type_: 'show' (public) or 'hide' (private) bookmarks
        :return: (ids, profile image urls, usernames) — parallel lists
        """
        url = 'https://www.pixiv.net/bookmark.php?type=user&rest={}&p={}'.format(
            type_, current_page_num)
        html = self.get(url).text
        ids = self.illuster_id_from_user.findall(html)
        profiles = self.illuster_profile_from_user.findall(html)
        usernames = self.illuster_username_from_user.findall(html)
        return ids, profiles, usernames

    def get_show_pagenum(self, html):
        """Count pagination links of the public bookmark list in *html*."""
        page = self.pagenum_4_show.findall(html)
        return len(page)

    def get_hide_pagenum(self, html):
        """Count pagination links of the private bookmark list in *html*."""
        page = self.pagenum_4_hide.findall(html)
        return len(page)

    def get_concerned_illusters_pagenum(self, type_: str):
        """Return the page count of the followed-artists list;
        *type_* is 'hide' or 'show' (anything else yields 0)."""
        content = self.session.get(
            'https://www.pixiv.net/bookmark.php?type=user&rest={}'.format(
                type_),
            cookies={
                "cookies": self.cookie
            }).text
        page_num = 0
        if type_ == 'hide':
            page_num = self.get_hide_pagenum(content)
        elif type_ == 'show':
            page_num = self.get_show_pagenum(content)
        return page_num

    def get_pagenum_hide(self, html):
        """Duplicate of get_hide_pagenum (kept for compatibility)."""
        page = self.pagenum_4_hide.findall(html)
        return len(page)

    def get_ugoira_info(self, illust_id):
        """Fetch ugoira (animated work) metadata for *illust_id*.

        :return: (frames dict file->delay, frame count, zip archive url,
            list of per-frame delays)
        """
        gif_info = str2json(
            self.session.get(self.UGOIRA_URL.format(illust_id=illust_id),
                             cookies={
                                 "cookies": self.cookie
                             }).text)
        delays = [item["delay"] for item in gif_info["body"]["frames"]]
        frames = {f['file']: f['delay'] for f in gif_info["body"]['frames']}
        page_num = len(delays)
        zip_url = gif_info["body"]["originalSrc"]
        return frames, page_num, zip_url, delays

    def download_ugoira(self, save_dir, illust_id, illuster_id, title, url,
                        restrict, x_restrict):
        """Download a ugoira's frame archive and record it in the DB.

        NOTE(review): gif_path is computed but never used — presumably
        GIF assembly was planned; confirm before removing."""
        try:
            frames, page_num, zip_url, delays = self.get_ugoira_info(illust_id)
            gif_path = os.path.join(save_dir, illust_id + ".gif")
            zip_path = os.path.join(save_dir, illust_id + ".zip")
            res = self.spider_util.download_ugoira(zip_url, zip_path,
                                                   self.headers)
            # Persist the delays as a comma-separated string.
            delays = [str(i) for i in delays]
            delays = ",".join(delays)
            if res:  # download succeeded
                self.insert_ugoira(illust_id, title, url, illuster_id,
                                   page_num=page_num,
                                   restrict=restrict,
                                   x_restrict=x_restrict,
                                   status=WorkStatus.done.value,
                                   delays=delays)
            else:
                self.insert_ugoira(illust_id, title, url, illuster_id,
                                   page_num=page_num,
                                   restrict=restrict,
                                   x_restrict=x_restrict,
                                   status=WorkStatus.failure.value,
                                   delays=delays)
        except Exception:
            self.logger.error('下载动图时,遇到问题', exc_info=True)

    def insert_illust(self,
                      illust_id,
                      title=None,
                      url=None,
                      illuster_id=None,
                      page_num=None,
                      restrict=None,
                      x_restrict=None,
                      status=None):
        """Insert or update a still-image work row keyed by illust_id."""
        select_sql = 'SELECT id FROM illust WHERE illust_id = %s '
        res = self.db_util.get_one(select_sql, illust_id)
        # Local relative path: <artist id>/<original file name>.
        loc_url = str(illuster_id) + "/" + url.split('/')[-1]
        if not res:  # no existing row for this work
            insert_sql = 'INSERT INTO illust(title, url, illust_id, illuster_id, page_no, `type`, status,`restrict`,' \
                'x_restrict, loc_url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
            self.db_util.insert(insert_sql,
                                (title, url, illust_id, illuster_id, page_num,
                                 IllustType.illust.value, status, restrict,
                                 x_restrict, loc_url))
        else:
            update_sql = 'UPDATE illust SET status = %s, page_no=%s, `restrict`=%s, x_restrict=%s, title=%s, ' \
                'loc_url=%s, type=%s WHERE illust_id=%s'
            self.db_util.update(update_sql,
                                (status, page_num, restrict, x_restrict,
                                 title, loc_url, IllustType.illust.value,
                                 illust_id))

    def insert_ugoira(self,
                      illust_id,
                      title,
                      url,
                      illuster_id,
                      page_num,
                      restrict=None,
                      x_restrict=None,
                      status=None,
                      delays=None):
        """Insert or update a ugoira row (plus its delays side table).

        NOTE(review): in the update branch the parameter tuple puts
        WorkStatus.done.value in the status column and *status* in the
        type column — that looks like swapped arguments; verify against
        the schema before changing."""
        select_sql = 'SELECT id FROM illust WHERE illust_id = %s AND type = {}'.format(
            IllustType.ugoira.value)
        res = self.db_util.get_one(select_sql, illust_id)
        loc_url = str(illuster_id) + "/" + url.split('/')[-1]
        if not res:  # no existing row for this work
            insert_sql = "INSERT INTO ugoira( ugoira_id, delays) VALUES (%s, %s)"
            self.db_util.insert(insert_sql, (illust_id, delays))
            insert_sql = 'INSERT INTO illust(title, url, illust_id, illuster_id, page_no, `type`, status,`restrict`,' \
                'x_restrict, loc_url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
            self.db_util.insert(insert_sql,
                                (title, url, illust_id, illuster_id, page_num,
                                 IllustType.ugoira.value, status, restrict,
                                 x_restrict, loc_url))
        else:
            update_sql = 'UPDATE illust SET status = %s, page_no=%s, `restrict`=%s, x_restrict=%s, title=%s, ' \
                'loc_url=%s, type=%s WHERE illust_id=%s'
            self.db_util.update(
                update_sql,
                (WorkStatus.done.value, page_num, restrict, x_restrict, title,
                 loc_url, status, illust_id))

    def update_illust(self,
                      illust_id,
                      page_num,
                      illust_status=WorkStatus.done.value):
        """Set a work's status and page count by illust_id."""
        update_sql = 'UPDATE illust SET status = %s , page_no=%s WHERE illust_id = %s'
        self.db_util.update(update_sql, (illust_status, page_num, illust_id))

    def isError(self, json):
        """Return the API response's 'error' flag (truthy on failure)."""
        return json['error']

    def download_manga(self, illust_id, illuster_id, title, img_url,
                       page_count, restrict, x_restrict):
        """Manga download is not implemented yet."""
        raise NotImplementedError
        pass

    def get(self, url):
        """GET *url* with the auth cookie and the configured timeout."""
        return self.session.get(url,
                                cookies={"cookies": self.cookie},
                                timeout=self.timeout)

    def get_illuster_info(self, illuster_id):
        """Return (name, profile image url) for an artist, or None when
        the artist does not exist or the request fails."""
        try:
            content = self.get(
                self.URL_4_GET_ILLUSTER_INFO.format(illuster_id)).text
            content_json = str2json(content)
            if self.isError(content_json):
                self.logger.error("画师不存在!illuster_id是{}".format(illuster_id),
                                  exc_info=True)
                return None
            name = content_json['body']['name']
            img_url = content_json['body']['imageBig']
            return name, img_url
        except Exception:
            self.logger.error("无法判断画师是否存在!illuster_id是{}".format(illuster_id),
                              exc_info=True)
            return None
def __init__(self):
    """Acquire a database session for this DAO."""
    db_util = MysqlUtil()
    self.session = db_util.get_session()
class AuthorDAO():
    """CRUD helpers for Author rows.

    Mutating methods return an (ok, message) pair instead of raising.
    """

    def __init__(self):
        self.session = MysqlUtil().get_session()

    def get_authors(self):
        """Return every author."""
        return self.session.query(Author).all()

    def get_author_by_name(self, auth_name):
        """Return the author named *auth_name*, or None."""
        return self.session.query(Author).filter_by(name=auth_name).first()

    def get_author_by_id(self, auth_id):
        """Return the author with primary key *auth_id*, or None."""
        return self.session.query(Author).filter_by(id=auth_id).first()

    def add_author(self, auth_name):
        """Create an author unless the name is already taken."""
        if self.get_author_by_name(auth_name) is None:
            auth = Author(name=auth_name)
            self.session.add(auth)
            self.session.commit()
            return True, u'添加成功'
        else:
            return False, u'作者已经存在!'

    def delete_author(self, auth_name):
        """Delete the author named *auth_name*."""
        auth = self.get_author_by_name(auth_name)
        if auth is None:
            return False, u'作者不存在!'
        self.session.delete(auth)
        self.session.commit()
        return True, u'删除作者成功!'

    def update_author(self, auth_id, auth_name):
        """Rename author *auth_id* to *auth_name* (must be unused)."""
        auth = self.get_author_by_name(auth_name)
        if auth is not None:
            return False, u'作者名已经存在!'
        auth = self.get_author_by_id(auth_id)
        # FIX: previously an unknown auth_id crashed with AttributeError
        # on the assignment below; now it reports a clean failure.
        if auth is None:
            return False, u'作者不存在!'
        auth.name = auth_name
        self.session.commit()
        return True, u'修改成功!'
def main():
    """Interactively dump every table of a source MySQL database and,
    after an explicit typed confirmation code, import the dump into a
    target database.

    Dumps are versioned under ./.sql/dump/<timestamp>; answering 'no'
    to the first prompt reuses an existing dump version instead of
    re-exporting.
    """
    is_new = input('是否重新导出数据? yes/no:')
    version = time.time()
    if is_new == 'no':
        version = input('请输入历史版本 ps:1601260075.6850908:')
        if not os.path.exists('./.sql/dump/{0}'.format(version)):
            raise Exception('版本不存在')
    path_dump = './.sql/dump/{0}'.format(version)
    if not os.path.exists(path_dump):
        os.makedirs(path_dump)
    db_from_config = {
        'ip': '',
        'port': 3306,
        'user': '',
        'passwd': '',
        'db': ''
    }
    db_to_config = {'ip': '', 'port': 3306, 'user': '', 'passwd': '', 'db': ''}
    # Refuse to import into anything that looks like a managed RDS host.
    if 'rds' in db_to_config['ip']:
        raise Exception('导入数据库存在敏感地址,请再次验证')
    mysql_util = MysqlUtil(config=db_from_config)
    # FIX: replaced a manual append loop with a comprehension.
    all_tables = [table[0] for table in mysql_util.query_sql('SHOW TABLES')]
    if is_new == 'yes':
        for t in all_tables:
            print('----> from database <{0}>'.format(db_from_config['ip']))
            print('----> dump table <{0}>'.format(t))
            cmd_dump_sql = 'mysqldump --set-gtid-purged=off -h{h} -P{P} -u{u} -p{p} {db} --tables {tb}>{path_dump}/{tb}.sql'.format(
                h=db_from_config['ip'],
                P=db_from_config['port'],
                u=db_from_config['user'],
                p=db_from_config['passwd'],
                db=db_from_config['db'],
                path_dump=path_dump,
                tb=t)
            execute_cmd(cmd_dump_sql)
    # A one-off five-letter code the operator must type back before the
    # import runs.  FIX: sample directly from a lowercase-letter string
    # instead of a hand-rolled 26-element list.
    verify_code = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz', 5))
    print('----------WARNING START----------')
    # FIX: corrected typos in the operator-facing warning text.
    print('----------information below must be read carefully!----------')
    print(
        '----------or it will make an unpredictable and harmful issue----------')
    print('----------FROM DATABASE INFO----------')
    print('----------IP {0}----------'.format(db_from_config['ip']))
    print('----------DATABASE {0}----------'.format(db_from_config['db']))
    print('----------TO DATABASE INFO----------')
    print('----------IP {0}----------'.format(db_to_config['ip']))
    print('----------DATABASE {0}----------'.format(db_to_config['db']))
    print('----------WARNING END----------')
    print('----------VERIFY CODE: {0}------'.format(verify_code))
    is_confirm_do = input('已阅读警告 yes/no:')
    if is_confirm_do == 'no':
        print('程序退出')
        return
    if is_confirm_do == 'yes':
        check_verify_code = input('输入上述验证码:')
        if check_verify_code != verify_code:
            raise Exception('验证码错误')
        for t in all_tables:
            print('----> to database <{0}>'.format(db_to_config['ip']))
            print('----> import table <{0}>'.format(t))
            cmd_import_sql = 'mysql -h{h} -P{P} -u{u} -p{p} {db}<{path_dump}/{tb}.sql'.format(
                h=db_to_config['ip'],
                P=db_to_config['port'],
                u=db_to_config['user'],
                p=db_to_config['passwd'],
                db=db_to_config['db'],
                path_dump=path_dump,
                tb=t)
            execute_cmd(cmd_import_sql)
# |\ __ \|\ \|\ \|\___ ___\\ __ \ # \ \ \|\ \ \ \\\ \|___ \ \_\ \ \|\ \ # \ \ __ \ \ \\\ \ \ \ \ \ \ \\\ \ # \ \ \ \ \ \ \\\ \ \ \ \ \ \ \\\ \ # \ \__\ \__\ \_______\ \ \__\ \ \_______\ # \|__|\|__|\|_______| \|__| \|_______| # ________ ________ ________ ___ __ ___ _______ ________ # |\ ____\|\ __ \|\ __ \|\ \ |\ \|\ \ |\ ___ \ |\ __ \ # \ \ \___|\ \ \|\ \ \ \|\ \ \ \ \ \ \ \ \ \ \ __/|\ \ \|\ \ # \ \ \ \ \ _ _\ \ __ \ \ \ __\ \ \ \ \ \ \ \_|/_\ \ _ _\ # \ \ \____\ \ \\ \\ \ \ \ \ \ \|\__\_\ \ \ \____\ \ \_|\ \ \ \\ \| # \ \_______\ \__\\ _\\ \__\ \__\ \____________\ \_______\ \_______\ \__\\ _\ # \|_______|\|__|\|__|\|__|\|__|\|____________|\|_______|\|_______|\|__|\|__| conn = MysqlUtil() #tags = conn.get_all('SELECT * FROM `test1.0`.`tag_Asyn`') ids = conn.get_all( 'SELECT `test1.0`.`book_Asyn`.`book_id` FROM `test1.0`.`book_Asyn` LEFT JOIN `test1.0`.`book_detail` ON `test1.0`.`book_Asyn`.`book_id` = `test1.0`.`book_detail`.`book_id` WHERE `book_introduct` IS NULL;' ) print(ids) #cids = conn.get_all(' SELECT `test1.0`.`book_Asyn`.`book_id` FROM `test1.0`.`book_Asyn` LEFT JOIN `commenttable_Asyn` cA on `book_Asyn`.`book_id` = cA.`book_id` WHERE `comment` IS NULL; ') runner = CrawlerRunner(get_project_settings()) @defer.inlineCallbacks def crawl(): while True: for tag in tags: print('*********************') print('\t\a' + tag[0])
#!/usr/bin/env python # coding=UTF-8 ''' # Desc: # Author:TavisD # Time:2016-10-10 10:53 # Ver:V1.0 ''' from utils.api_client import APIClient from utils.file_util import FileUtil from utils.mysql_util import MysqlUtil from utils.gen_util import GenUtil api_client = APIClient() file_util = FileUtil() mysql_util = MysqlUtil() gen_util = GenUtil()
class PixivSpider():
    """Crawl pixiv illustrators and download their works.

    Crawl state (pending/done illusters, work counts) is tracked in MySQL;
    downloads run on a thread pool sized from the config file.
    """

    def load_config(self):
        """Load runtime settings from config/config.conf."""
        self.config = ConfigureUtil('config/config.conf')
        try:
            self.thread_num = self.config.get("app", "thread_num", type_="int")
            # Root directory that downloaded images are stored under.
            self.root_path = self.config.get("download", "path")
            # Seconds to sleep between download batches.
            self.waiting_time = self.config.get("download", "waiting_time",
                                                type_="int")
        except Exception as e:
            self.logger.error("请检查你配置的下载路径====》{}".format(repr(e)),
                              exc_info=True)
            raise e

    def load_logger(self):
        """Create the module logger."""
        self.logger = Log(__name__).get_log()

    def __init__(self):
        # SQL template: mark an illuster as fully downloaded (priority 0).
        self.sql_4_update = 'update illuster set priority = 0, modify_time = "{}" WHERE illuster_id = {}'
        self.load_logger()
        self.load_config()
        self.pixiv_util = PixivUtil()
        self.db_util = DbUtil()
        self.pool = threadpool.ThreadPool(self.thread_num)
        # Pending illusters, highest priority first; illuster 11 is excluded.
        self.WAITING_SQL = 'select illuster_id from illuster WHERE priority > 0 ' \
                           ' AND illuster_id!=11 ORDER BY priority DESC LIMIT {} '
        ssl._create_default_https_context = ssl._create_unverified_context
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        # Workaround for ('ssl3_read_bytes', 'sslv3 alert bad record mac').
        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'

    def login(self):
        """Log in to pixiv; return True on success, False otherwise."""
        is_login = self.pixiv_util.login()
        if not is_login:
            self.logger.info("登陆失败,请检查账号密码是否正确和网络是否连通")
            return False
        else:
            return True

    def save_concerned_illuster_info(self, type_: str):
        """
        Find the illusters the logged-in user follows and persist them.

        :param type_: 'hide' (private follows) or 'show' (public follows)
        """
        if not self.login():
            return
        page_num = self.pixiv_util.get_concerned_illusters_pagenum(type_)
        for current_page_num in range(1, page_num + 1):
            ids_, profiles, usernames = self.pixiv_util.get_concerned_illuster_info(
                type_, current_page_num)
            for illuster_id, profile, username in zip(ids_, profiles,
                                                      usernames):
                # FIX: parameterized query (was string concatenation), matching
                # the style already used in main().
                res = self.db_util.get_one(
                    "select id from illuster where illuster_id = %s",
                    illuster_id)
                str_time = get_time()
                if res:
                    self.logger.debug("该画师已存在" + illuster_id)
                    self.db_util.update(
                        'UPDATE illuster SET name=%s, image_url=%s, modify_time=%s, priority=%s '
                        'WHERE illuster_id =%s',
                        (username, profile, str_time, 0, illuster_id))
                else:
                    # FIX: parameterized insert (was SQL built by concatenating
                    # the illuster name/profile — an injection risk).
                    self.db_util.insert(
                        "insert into illuster(illuster_id,name,image_url,create_time,modify_time,priority) "
                        "value(%s, %s, %s, %s, %s, 5)",
                        (illuster_id, username, profile, str_time, str_time))
        self.logger.info("信息保存完毕")

    def get_illuster_ids(self, num, use_data_base=False):
        """
        Return up to `num` illuster ids that still need crawling.

        Ids from the config file take precedence; the database queue is only
        consulted (when `use_data_base` is True) if the config supplies fewer
        than `num` ids.
        """
        ids = self.config.get("download", "illuster_ids")
        ids = [] if ids is None else ids.split()
        if len(ids) >= num:
            # FIX: honour `num`; the original hard-coded 10 everywhere below.
            return ids[:num]
        elif use_data_base:
            illuster_ids = self.db_util.get_all(self.WAITING_SQL.format(num))
            for illuster_id in illuster_ids:
                ids.append(str(illuster_id[0]))
            # FIX: de-duplicate *preserving order* before truncating; the
            # original sliced an unordered set() first, which could drop
            # arbitrary (wrong) ids.
            return list(dict.fromkeys(ids))[:num]
        else:
            return ids[:num]

    def main(self, batch_size=10, use_database=False):
        """
        Main download loop: pick pending illuster ids (config file first, then
        the database queue), refresh illuster metadata and download every work.

        :param batch_size: how many illusters to process per batch
        :param use_database: also track/download state through MySQL
        """
        while True:
            illuster_ids = self.get_illuster_ids(batch_size, use_database)
            if len(illuster_ids) == 0:
                time.sleep(60 * 60 * 4)
            for illuster_id in illuster_ids:
                self.logger.info("正在下载的是" + illuster_id + "的作品")
                if use_database:
                    res = self.db_util.get_one(
                        "select id from illuster where illuster_id = %s",
                        illuster_id)
                    str_time = get_time()
                    info = self.pixiv_util.get_illuster_info(illuster_id)
                    if info is None:
                        # Illuster no longer exists on pixiv: priority -1.
                        # NOTE(review): this runs an UPDATE through
                        # db_util.insert — confirm insert/update share one
                        # execute path.
                        self.db_util.insert(
                            "update illuster set priority = -1 WHERE illuster_id = %s",
                            illuster_id)
                        continue
                    else:
                        name, img_url = info
                    if res:
                        self.logger.debug("该画师已存在数据库中" + illuster_id)
                        self.db_util.update(
                            "update illuster set name = %s, image_url = %s, modify_time = %s where illuster_id = %s",
                            (name, img_url, str_time, illuster_id))
                    else:
                        # NOTE(review): a second INSERT for the same id happens
                        # again below once illust_num is known — likely a
                        # duplicate-key problem; confirm db_util semantics.
                        self.db_util.insert(
                            "insert into illuster(illuster_id, name, image_url, create_time,modify_time,priority)"
                            " value(%s, %s, %s, %s, %s,5)",
                            (illuster_id, name, img_url, str_time, str_time))
                dir_ = os.path.join(self.root_path, illuster_id)
                make_directory(dir_)
                illust_ids = self.pixiv_util.get_illust_ids(illuster_id)
                if illust_ids is None:
                    # FIX: was testing `illuster_ids` (the whole batch list,
                    # never None here) instead of the freshly fetched
                    # `illust_ids`, so fetch failures were never caught.
                    self.logger.error(
                        '获得插画列表失败,画师id为{}, 开始下一个画师的信息爬取'.format(illuster_id))
                    continue
                illust_num = len(illust_ids)
                self.logger.info("共有插图%d个" % (illust_num))
                if use_database:
                    if res:
                        self.logger.debug("该画师已存在数据库中" + illuster_id)
                        self.db_util.update(
                            "update illuster set name = %s, image_url = %s, modify_time = %s, illust_num = %s where illuster_id = %s",
                            (name, img_url, str_time, illust_num, illuster_id))
                    else:
                        self.db_util.insert(
                            "insert into illuster(illuster_id, name, image_url, create_time,modify_time,illust_num, priority)"
                            " value(%s, %s, %s, %s, %s, %s, 5)",
                            (illuster_id, name, img_url, str_time, str_time,
                             illust_num))
                if illust_num > 0:
                    if use_database:
                        # Skip works that were already downloaded.
                        illust_ids = self.pixiv_util.filter_4_downloaded_work(
                            illust_ids)
                    # One thread-pool work item per illust.
                    var_list = []
                    for illust_id in illust_ids:
                        var_list.append(([dir_, illust_id], None))
                    if self.pool is None:
                        self.pool = threadpool.ThreadPool(self.thread_num)
                    tasks = threadpool.makeRequests(
                        self.pixiv_util.download_work_by_illust_id, var_list)
                    [self.pool.putRequest(task) for task in tasks]
                    self.pool.wait()
                # The first configured id downloads first, so drop it from the
                # config once its batch entry has been processed.
                il_ids = self.config.get("download", "illuster_ids")
                if il_ids is not None:
                    self.config.update("download", "illuster_ids",
                                       " ".join(il_ids.split()[1:]))
                if use_database:
                    self.db_util.update(
                        self.sql_4_update.format(get_time(), illuster_id))
                self.logger.info("{}的作品下载完成".format(illuster_id))
            self.logger.info('one batch is over')
            time.sleep(self.waiting_time)
            # FIX: removed the original's unreachable trailing statements
            # (a log line and `session.close()` on an undefined `session`)
            # that sat after this infinite loop.
def __init__(self, path):
    """Remember the download root and set up DB/spider helpers plus a logger.

    :param path: root directory that downloaded files live under
    """
    self.root_path = path
    self.db_util = MysqlUtil()
    # Attribute keeps the original 'spdier' spelling: other code reads it.
    self.spdier_util = SpiderUtil()
    self.logger = Log(__name__).get_log()
class Downloader:
    """Pull pending illust URLs from MySQL and download them on a thread pool."""

    def __init__(self, path):
        self.db_util = MysqlUtil()
        self.spdier_util = SpiderUtil()
        self.root_path = path
        self.logger = Log(__name__).get_log()

    def get_info(self, select_sql):
        """Run `select_sql` and build the download work list.

        Each row is (loc_url, illust_id, illuster_id, type, page_no); the
        result is a list of (url, save_path, page_no, illust_id) tuples.
        """
        res = self.db_util.get_all(select_sql)
        infos = []
        for info in res:
            type_ = info[3]
            if type_ == IllustType.ugoira.value:
                # todo: only the zip archive is downloaded for ugoira;
                # thumbnails are handled later.
                url = Downloader.handle_url_4_ugoira(info[0])
            else:
                url = str(info[0], encoding='utf-8')
            if '.gif' in url:
                # gif works are not handled yet
                continue
            if '.jpg' in url or '.png' in url:
                pass
            else:
                # NOTE(review): unsupported types are warned about but still
                # queued for download — confirm whether a `continue` was meant.
                self.logger.warning('不支持的类型!illust_id为{}'.format(info[1]))
            infos.append((url, self.get_path(url, str(info[2])), info[4],
                          info[1]))
        return infos

    def get_path(self, url: str, illuster_id: str):
        """Return the local save path for `url`, creating the illuster dir."""
        path = os.path.join(self.root_path, illuster_id)
        make_directory(path)
        return os.path.join(path, url.split("/")[-1])

    @staticmethod
    def handle_url_4_ugoira(url):
        """Rewrite an original-image URL (bytes) into its ugoira zip URL."""
        url = str(url, encoding="utf-8")
        i = url.replace('img-original', 'img-zip-ugoira')
        i = i.replace('0.jpg', '1920x1080.zip')
        i = i.replace('0.png', '1920x1080.zip')
        return i

    def download_file(self, url, save_path, headers):
        """Download one file; return 404, True (valid file) or False."""
        res = self.spdier_util.download_file(url, save_path, headers)
        if res == 404:
            self.logger.error('无法找到文件,url为{}'.format(url))
            return 404
        if res and valid_file(save_path, file_extension(save_path)):
            return True
        else:
            return False

    def download_file_list(self, url, save_path, headers, page_num: int,
                           illust_id):
        """Download all pages of one illust; only when every page succeeds is
        the illust marked complete. A 404 marks it unfound immediately."""
        res = True
        for i in range(page_num):
            url_temp = url.replace('_p0', '_p{}'.format(i))
            save_path_temp = save_path.replace('_p0', '_p{}'.format(i))
            # FIX: download each page unconditionally. The original wrote
            # `res = res and self.download_file(...)`, which short-circuited
            # after the first failure, silently skipping the remaining pages
            # and missing any later 404.
            page_res = self.download_file(url_temp, save_path_temp, headers)
            if page_res == 404:
                self.db_util.update(
                    'UPDATE illust SET status = {} WHERE illust_id = {}'.
                    format(WorkStatus.unfound.value, illust_id))
                return
            res = res and page_res
        if res:
            self.db_util.update(
                'UPDATE illust SET status = {} WHERE illust_id = {}'.format(
                    WorkStatus.complete.value, illust_id))

    def main(self, select_sql, headers, thread_num, interval):
        """
        Poll the database and download whatever is pending, forever.

        :param select_sql: 查询数据的sql语句
        :param headers: 请求头设置
        :param thread_num: 下载线程数
        :param interval: 循环查询数据库的时间间隔
        :return:
        """
        pool = None
        while True:
            infos = self.get_info(select_sql)
            print('get info from database')
            if infos:
                param_list = [([i[0], i[1], headers, i[2], i[3]], None)
                              for i in infos]
                if pool is None:
                    pool = threadpool.ThreadPool(thread_num)
                tasks = threadpool.makeRequests(self.download_file_list,
                                                param_list)
                [pool.putRequest(task) for task in tasks]
                pool.wait()
                self.logger.info('one batch is over')
                print('one batch is over')
            time.sleep(interval)
class ArticleDao:
    """Data-access object for Article rows: query, paginate, create, delete."""

    def __init__(self):
        self.session = MysqlUtil().get_session()

    def _paginate(self, query, page, size):
        """Shared pagination over `query`, newest (modified_time) first.

        Returns (rows, info) where info is u'more' when another page exists
        and u'nomore' otherwise, or (None, error message) when `page` is out
        of range for this query's own row count.
        """
        article_num = query.count()
        if page * size > article_num:
            return None, u'分页数超出最大值'
        info = u'more' if (page + 1) * size < article_num else u'nomore'
        return query.order_by(desc(Article.modified_time)) \
            .offset(size * page).limit(size).all(), info

    def get_articles(self, page=0, size=20):
        """Return one page of all articles."""
        return self._paginate(self.session.query(Article), page, size)

    def get_article_num(self):
        """Total number of articles."""
        return self.session.query(Article).count()

    def get_article_by_title(self, title):
        """Return the article with this exact title, or None."""
        return self.session.query(Article).filter_by(title=title).first()

    def get_article_by_id(self, id):
        """Return the article with this id, or None."""
        return self.session.query(Article).filter_by(id=id).first()

    def new_article(self, title, author_id, author_name, cate_id, cate_name,
                    intro, filepath, tags):
        """Create a public article unless the title is already taken."""
        article = Article(title=title,
                          intro=intro,
                          is_public=1,
                          auth_id=author_id,
                          auth_name=author_name,
                          cate_id=cate_id,
                          cate_name=cate_name,
                          file_path=filepath,
                          tags=tags,
                          create_time=datetime.datetime.now(),
                          modified_time=datetime.datetime.now())
        if self.get_article_by_title(title) is None:
            self.session.add(article)
            self.session.commit()
            return True, u'成功添加新文章!'
        else:
            return False, u'文章标题已经存在!'

    def delete_article(self, article):
        """Delete the given article row and commit."""
        self.session.delete(article)
        self.session.commit()
        return True, u'删除成功!'

    def get_articles_by_cate(self, cate, page=0, size=20):
        """Return one page of articles in a category.

        FIX: pagination now counts only this category's rows; the original
        used the global article count, so the out-of-range check and the
        'more'/'nomore' flag were wrong for filtered queries.
        """
        return self._paginate(
            self.session.query(Article).filter_by(cate_id=cate), page, size)

    def get_article_by_tag(self, tag, page=0, size=20):
        """Return one page of articles whose tags contain `tag`.

        FIX: counts only matching rows (same global-count bug as above).
        """
        return self._paginate(
            self.session.query(Article).filter(
                Article.tags.like(u'%{tag}%'.format(tag=tag))), page, size)
class checker():
    """Audit downloaded illust files on disk against the MySQL `illust` table.

    Walks the download root (one directory per illuster), verifies that every
    recorded work is actually present and valid on disk, resets incomplete
    works to status 1 for re-download, and marks fully checked illusters with
    priority 6.
    """

    def __init__(self, root_path):
        # root_path: download root; each subdirectory name is an illuster_id.
        self.root_path = root_path
        self.SELECT_SQL = "SELECT page_no, loc_url,status FROM illust WHERE illust_id={}"
        # Reset every illust of an illuster for re-download.
        self.RESET_ILLUSTER_SQL = "UPDATE illust SET status = 1 WHERE illuster_id = {}"
        # Reset all illusts of an illuster EXCEPT the listed (complete) ones.
        self.RESET_ILLUST_SQL = "UPDATE illust SET status = 1 WHERE illuster_id = {} AND illust_id NOT IN ({})"
        self.FILTER_SQL = "SELECT illust_id FROM illust WHERE status = 10 AND illust_id in ({})"
        # priority = 6 marks an illuster as fully checked.
        self.MAKE_ILLUSTER_STATUS_DONE_SQL = "UPDATE illuster SET priority = 6 WHERE illuster_id = {}"
        self.CHECK_IF_DONE = 'SELECT priority FROM illuster WHERE illuster_id={}'
        self.GET_DONE_ILLUSTER = 'SELECT illuster_id FROM illuster WHERE priority=6'
        self.db_util = MysqlUtil()
        self.logger = Log(__name__, log_cate='checker').get_log()
        # Previously visited illuster; marked done once the next one starts.
        self.before_illuster_id = None

    def get_done_illuster(self):
        """Return ids of illusters already marked checked (priority == 6)."""
        illuster_ids = self.db_util.get_all(self.GET_DONE_ILLUSTER)
        done_illuster_ids = [i[0] for i in illuster_ids]
        return done_illuster_ids

    def check_empty_dir(self, ignore=[]):
        """Reset all illusts of any illuster whose download directory is empty.

        NOTE(review): mutable default argument — benign here because `ignore`
        is only rebound (never mutated in place), but worth cleaning up.

        :param ignore: directory names to skip, in addition to done illusters
        """
        try:
            ignore = ignore + self.get_done_illuster()
            for file_name in os.listdir(self.root_path):
                if file_name in ignore:
                    continue
                print(file_name)
                # each directory name is an illuster_id
                if self.before_illuster_id is not None:
                    # mark the previous illuster as fully checked
                    self.db_util.update(
                        self.MAKE_ILLUSTER_STATUS_DONE_SQL.format(
                            self.before_illuster_id))
                illuster_id = file_name
                self.before_illuster_id = illuster_id
                # skip illusters that were already processed (priority 6)
                if self.db_util.get_one(
                        self.CHECK_IF_DONE.format(illuster_id))[0] == 6:
                    continue
                path = os.path.join(self.root_path, file_name)
                if os.path.isdir(path):
                    image_files = os.listdir(path)
                    if len(image_files) == 0:
                        print("no images in {}".format(path))
                        self.db_util.update(
                            self.RESET_ILLUSTER_SQL.format(illuster_id))
                        continue
        except Exception as e:
            self.logger.error('some problem happen', exc_info=1)
            raise e

    def check(self, ignore=[]):
        """Verify each illuster directory file-by-file against the database.

        For every illust recorded as downloaded (status >= done), check that
        all of its pages exist and are valid files; anything incomplete is
        reset to status 1 via RESET_ILLUST(ER)_SQL.

        NOTE(review): mutable default argument — benign (only rebound).

        :param ignore: directory names to skip, in addition to done illusters
        """
        try:
            ignore = ignore + self.get_done_illuster()
            for file_name in os.listdir(self.root_path):
                if file_name in ignore:
                    continue
                print(file_name)
                # each directory name is an illuster_id
                if self.before_illuster_id is not None:
                    # mark the previous illuster as fully checked
                    self.db_util.update(
                        self.MAKE_ILLUSTER_STATUS_DONE_SQL.format(
                            self.before_illuster_id))
                illuster_id = file_name
                self.before_illuster_id = illuster_id
                # skip illusters that were already processed (priority 6)
                if self.db_util.get_one(
                        self.CHECK_IF_DONE.format(illuster_id))[0] == 6:
                    continue
                path = os.path.join(self.root_path, file_name)
                if os.path.isdir(path):
                    image_files = os.listdir(path)
                    if len(image_files) == 0:
                        print("no images in {}".format(path))
                        self.db_util.update(
                            self.RESET_ILLUSTER_SQL.format(illuster_id))
                        continue
                    # Derive the set of illust ids present on disk from the
                    # file names ("<id>.zip" or "<id>_p<n>.<ext>").
                    illust_ids = []
                    for file_ in image_files:
                        if file_.endswith('.zip'):
                            illust_id = file_.replace('.zip', '')
                        elif file_.endswith('.gif'):
                            # todo gifs are not handled yet
                            continue
                        else:
                            illust_id = file_.split("_p")[0]
                        if illust_id not in illust_ids:
                            illust_ids.append(illust_id)
                    complete_illust_ids = []
                    # if not illust_ids:
                    #     continue
                    # illust_id_list = self.db_util.get_all(self.FILTER_SQL.format(','.join(illust_ids)))
                    # illust_ids = [i[0] for i in illust_id_list]
                    # if illust_ids:
                    #     continue
                    for illust_id in illust_ids:
                        res = True
                        info = self.db_util.get_one(
                            self.SELECT_SQL.format(illust_id))
                        # no database record for this illust
                        if not info:
                            self.logger.info(
                                '数据库中没有信息,illust_id为{}'.format(illust_id))
                            print(illust_id)
                            continue
                        (page_no, loc_url, status) = info
                        if status < WorkStatus.done.value:
                            # 10 is the "downloaded" status; below 10 means
                            # the download never finished — nothing to verify
                            continue
                        loc_url = str(loc_url, encoding='utf-8')
                        if 'gif' in loc_url:
                            # gifs should in theory never reach this branch
                            complete_illust_ids.append(illust_id)
                            continue
                        elif 'ugoira' in loc_url:
                            # ugoira works are stored as a single zip archive
                            loc_url = loc_url.replace('_ugoira0.jpg', '.zip')
                            loc_url_temp = loc_url.replace(
                                '_ugoira0.png', '.zip')
                            if valid_file(
                                    os.path.join(self.root_path, loc_url_temp)):
                                complete_illust_ids.append(illust_id)
                            continue
                        # Plain jpg/png: every page _p0.._p<page_no-1> must be
                        # present and valid for the work to count as complete.
                        for i in range(page_no):
                            loc_url_temp = loc_url.replace(
                                "_p0", "_p{}".format(i))
                            res = res and valid_file(
                                os.path.join(self.root_path, loc_url_temp))
                        # only complete jpg/png sets reach this point
                        if res:
                            complete_illust_ids.append(illust_id)
                    if not complete_illust_ids:
                        # nothing valid on disk: reset the whole illuster
                        self.db_util.update(
                            self.RESET_ILLUSTER_SQL.format(illuster_id))
                        # self.db_util.update(self.RESET_ILLUST_SQL.format(",".join(incomplete_illust_ids)))
                    else:
                        # reset everything except the verified-complete works
                        self.db_util.update(
                            self.RESET_ILLUST_SQL.format(
                                illuster_id, ','.join(complete_illust_ids)))
                        print(complete_illust_ids)
            # finish off the very last illuster_id visited by the loop
            if self.before_illuster_id is not None:
                self.db_util.update(
                    self.MAKE_ILLUSTER_STATUS_DONE_SQL.format(
                        self.before_illuster_id))
        except Exception as e:
            self.logger.error('some problem happen', exc_info=1)
            raise e
class CategoryDAO:
    """Data-access object for Category rows (create, read, update, delete)."""

    def __init__(self):
        self.session = MysqlUtil().get_session()

    def add_category(self, cate_name):
        """Create a category unless the name is already taken."""
        if self.get_cate_by_name(cate_name) is None:
            cate = Category(cate_name=cate_name)
            self.session.add(cate)
            self.session.commit()
            return True, u'成功添加新分类'
        else:
            return False, u'类目已经存在'

    def get_cate_by_name(self, cate_name):
        """Return the category with this name, or None.

        (`first()` already yields None on no match — no extra check needed.)
        """
        return self.session.query(Category).filter_by(
            cate_name=cate_name).first()

    def get_cate_by_id(self, cate_id):
        """Return the category with this id, or None."""
        return self.session.query(Category).filter_by(id=cate_id).first()

    def get_categories(self):
        """Return all categories."""
        return self.session.query(Category).all()

    def delete_category(self, cate_id):
        """Delete the category with this id, if it exists."""
        cate = self.get_cate_by_id(cate_id)
        if cate is not None:
            self.session.delete(cate)
            self.session.commit()
            return True, u'删除成功'
        else:
            return False, u'删除失败,删除的类目不存在!'

    def update_category(self, cate_id, cate_name):
        """Rename category `cate_id`; the new name must not already exist."""
        cate = self.get_cate_by_id(cate_id)
        if cate is not None:
            # FIX: uniqueness must be checked by *name*. The original called
            # get_cate_by_id(cate_name), which looked the name up as an id and
            # therefore never detected a duplicate name.
            if self.get_cate_by_name(cate_name) is None:
                cate.cate_name = cate_name
                cate.modify_time = datetime.datetime.utcnow()
                self.session.commit()
                return True, u'修改成功!'
            else:
                return False, u'修改的类目名称已经存在!'
        else:
            return False, u'指定ID的类目不存在!'
class GenFile():
    """Generate Java entity and mapper .java sources from MySQL table metadata.

    Reads the `tacomall` schema through information_schema and renders one
    file per table from the entity/mapper templates, skipping files that
    already exist on disk.
    """

    def __init__(self):
        self.path_tpl_entity = PathUtil.root('templates') + '/entity.tpl'
        self.path_tpl_mapper = PathUtil.root('templates') + '/mapper.tpl'
        # Target project directory the generated .java files are written into.
        self.project_java = 'test'
        self.mysql_util = MysqlUtil({
            'ip': 'localhost',
            'user': '******',
            'passwd': '123456',
            'db': 'tacomall'
        })
        self.all_tables = []
        self._query_all_tables()

    def _query_all_tables(self):
        """Populate self.all_tables with every table name in the schema.

        NOTE(review): the schema name is hard-coded here and duplicated from
        the connection config in __init__ — keep the two in sync.
        """
        sql_all_tables = '''
            SELECT table_name FROM information_schema.`TABLES` WHERE TABLE_SCHEMA = 'tacomall';
        '''
        result_sql_all_tables = self.mysql_util.query_sql(sql_all_tables)
        self.all_tables = [row[0] for row in result_sql_all_tables]

    def _load_tpl(self, tpl_type=1):
        """Read a template file: 1 -> entity template, 2 -> mapper template.

        :raises ValueError: for any other `tpl_type`
        """
        if tpl_type == 1:
            path = self.path_tpl_entity
        elif tpl_type == 2:
            path = self.path_tpl_mapper
        else:
            # FIX: the original fell through with path='' and raised a
            # confusing FileNotFoundError on open('').
            raise ValueError('unknown template type: {0}'.format(tpl_type))
        # FIX: dropped the redundant close() — `with` already closes the file.
        with open(path) as f_tpl:
            return f_tpl.read()

    def _2camel(self, s, capitalize=False):
        """Convert snake_case to camelCase (PascalCase when capitalize=True)."""
        parts = s.lower().split('_')
        head = parts[0].capitalize() if capitalize else parts[0]
        return head + ''.join(p.capitalize() for p in parts[1:])

    def _gen_entity(self):
        """Render one entity .java per table, skipping existing files."""
        tpl = self._load_tpl(1)
        for table in self.all_tables:
            spl_table = table.split('_')
            package = 'store.tacomall.db.entity.{0}'.format(spl_table[0])
            class_name = self._2camel(table, capitalize=True)
            fields_content = ''
            # Audit columns are provided elsewhere; never emitted as fields.
            ignore_columns = [
                'id', 'is_delete', 'create_time', 'update_time', 'delete_time'
            ]
            filename_gen = PathUtil.root(self.project_java +
                                         '/entity/{0}/'.format(spl_table[0])
                                         ) + '{0}.java'.format(class_name)
            sql_table_structure = '''
                SELECT DATA_TYPE, column_name, column_comment FROM information_schema.COLUMNS WHERE table_name = '{0}';
            '''.format(table)
            if PathUtil.is_exists(filename_gen):
                print('实体类{0}.java文件已存在'.format(class_name))
                continue
            result_sql_table_structure = self.mysql_util.query_sql(
                sql_table_structure)
            for data_type, column, comment in result_sql_table_structure:
                if column in ignore_columns:
                    continue
                # Map MySQL column types onto Java field types; unmapped types
                # deliberately yield an empty type string, as before.
                java_data_type = ''
                if data_type == 'int':
                    java_data_type = 'int'
                if data_type in ('varchar', 'text'):
                    java_data_type = 'String'
                if data_type == 'datetime':
                    java_data_type = 'Date'
                fields_content = fields_content + \
                    '    private {0} {1}; // {2}\n\n'.format(
                        java_data_type, self._2camel(column), comment)
            new_tpl = tpl.replace('${PACKAGE}', package)
            new_tpl = new_tpl.replace('${CLASS}', class_name)
            new_tpl = new_tpl.replace('${FIELDS}', fields_content)
            # FIX: dropped the redundant close() inside the `with` block.
            with open(filename_gen, 'w+', encoding='utf8') as f_gen:
                f_gen.write(new_tpl)

    def _gen_mapper(self):
        """Render one mapper .java per table, skipping existing files."""
        tpl = self._load_tpl(2)
        for table in self.all_tables:
            spl_table = table.split('_')
            package = 'store.tacomall.db.mapper.{0}'.format(spl_table[0])
            class_name = self._2camel(table, capitalize=True) + 'Mapper'
            # NOTE(review): '/mappper/' carries a triple 'p'; existing
            # generated files live under that spelling, so it is preserved —
            # confirm before renaming the directory.
            filename_gen = PathUtil.root(self.project_java +
                                         '/mappper/{0}/'.format(spl_table[0])
                                         ) + '{0}.java'.format(class_name)
            if PathUtil.is_exists(filename_gen):
                print('mapper类{0}.java文件已存在'.format(class_name))
                continue
            new_tpl = tpl.replace('${PACKAGE}', package)
            new_tpl = new_tpl.replace('${CLASS}', class_name)
            new_tpl = new_tpl.replace('${ENTITY_DOMAIN}', spl_table[0])
            new_tpl = new_tpl.replace('${ENTITY_CLASS}',
                                      self._2camel(table, capitalize=True))
            with open(filename_gen, 'w+', encoding='utf8') as f_gen:
                f_gen.write(new_tpl)

    def run(self):
        """Generate entities first, then mappers."""
        self._gen_entity()
        self._gen_mapper()