def __init__(self):
     """Set up template paths, the MySQL helper and the table cache.

     The constructor finishes by calling ``_query_all_tables()``.
     """
     # NOTE(review): connection settings are hard-coded in source; consider
     # moving them to configuration.
     connection_settings = dict(
         ip='localhost',
         user='******',
         passwd='123456',
         db='tacomall',
     )

     # Template files used for the entity and mapper outputs.
     self.path_tpl_entity = PathUtil.root('templates') + '/entity.tpl'
     self.path_tpl_mapper = PathUtil.root('templates') + '/mapper.tpl'

     self.project_java = 'test'
     self.mysql_util = MysqlUtil(connection_settings)

     # Starts empty; _query_all_tables() is invoked right after.
     self.all_tables = []
     self._query_all_tables()
Beispiel #2
0
 def __init__(self, root_path):
     """Store the root path, create the DB helper and logger, define SQL templates.

     :param root_path: stored unchanged as ``self.root_path``.
     """
     self.root_path = root_path
     self.before_illuster_id = None

     self.db_util = MysqlUtil()
     self.logger = Log(__name__, log_cate='checker').get_log()

     # Templates against the ``illust`` table ({} placeholders are filled
     # by the caller).
     self.SELECT_SQL = (
         "SELECT page_no, loc_url,status FROM illust "
         "WHERE illust_id={}")
     self.FILTER_SQL = (
         "SELECT illust_id FROM illust "
         "WHERE status = 10 AND illust_id in ({})")
     self.RESET_ILLUSTER_SQL = (
         "UPDATE illust SET status = 1 "
         "WHERE illuster_id = {}")
     self.RESET_ILLUST_SQL = (
         "UPDATE illust SET status = 1 "
         "WHERE illuster_id = {} AND illust_id NOT IN ({})")

     # Templates against the ``illuster`` table.
     # priority = 6 marks an illustrator as fully checked.
     self.MAKE_ILLUSTER_STATUS_DONE_SQL = (
         "UPDATE illuster SET priority = 6 "
         "WHERE illuster_id = {}")
     self.CHECK_IF_DONE = (
         'SELECT priority FROM illuster '
         'WHERE illuster_id={}')
     self.GET_DONE_ILLUSTER = (
         'SELECT illuster_id FROM illuster '
         'WHERE priority=6')
Beispiel #3
0
class CommentDAO:
    """Data-access helper for ``Comment`` rows using a ``MysqlUtil`` session."""

    def __init__(self):
        mysql_util = MysqlUtil()
        self.session = mysql_util.get_session()

    def new_comment(self, art_id, content):
        """Add a comment by the current user to the article ``art_id``.

        :return: ``(False, message)`` when the article cannot be found,
            otherwise ``(True, message)`` after the comment is committed.
        """
        article = ArticleDao().get_article_by_id(art_id)
        if article is None:
            return False, u'评论的文章不存在'

        comment = Comment()
        comment.article_id = art_id
        comment.comment = content
        comment.user = current_user.username
        self.session.add(comment)
        self.session.commit()
        return True, u'评论成功!'

    def get_article_comments_num(self, art_id):
        """Return how many comments reference the article ``art_id``."""
        matching = self.session.query(Comment).filter_by(article_id=art_id)
        return matching.count()

    def get_comments_by_article(self, article_id, page=0, size=20):
        """Return one page of an article's comments, newest modification first.

        :param page: zero-based page index.
        :param size: number of comments per page.
        :return: ``(None, message)`` when ``page * size`` exceeds the number
            of comments; otherwise ``(comments, info)`` where ``info`` is
            ``'more'`` if further pages exist and ``'nomore'`` if not.
        """
        total = self.get_article_comments_num(article_id)
        start = page * size
        if start > total:
            return None, u'分页数超出最大值'

        if (page + 1) * size < total:
            info = u'more'
        else:
            info = u'nomore'

        query = self.session.query(Comment).filter_by(article_id=article_id)
        ordered = query.order_by(desc(Comment.modified_time))
        comments = ordered.offset(size * page).limit(size).all()
        return comments, info
Beispiel #4
0
    def __init__(self):
        """Load logging and configuration, create helpers and relax SSL checks."""
        # SQL templates; the {} placeholders are filled in by the caller.
        self.sql_4_update = ('update illuster set priority = 0,  modify_time = "{}" '
                             'WHERE illuster_id  = {}')
        self.WAITING_SQL = ('select illuster_id from illuster WHERE priority > 0 '
                            ' AND illuster_id!=11 ORDER BY priority DESC LIMIT {} ')

        # Logger and configuration first: the thread pool below reads
        # self.thread_num (presumably set by load_config() -- TODO confirm).
        self.load_logger()
        self.load_config()

        self.pixiv_util = PixivUtil()
        self.db_util = DbUtil()
        self.pool = threadpool.ThreadPool(self.thread_num)

        # NOTE(review): replaces the default HTTPS context factory with the
        # unverified one and silences the matching urllib3 warning; this is
        # module-level state, not limited to this instance.
        ssl._create_default_https_context = ssl._create_unverified_context
        urllib3 = requests.packages.urllib3
        urllib3.disable_warnings(InsecureRequestWarning)

        # Experimental: meant to work around the
        # 'ssl3_read_bytes' / 'sslv3 alert bad record mac' error.
        urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'
Beispiel #5
0
    def __init__(self):
        """Prepare patterns, configuration, endpoints, the HTTP session and SQL.

        The session setup reads ``self.cookie``, ``self.proxies`` and
        ``self.max_retries`` (presumably populated by ``load_config()`` --
        TODO confirm).
        """
        self.init_pattern()
        self.load_config()
        self.logger = Log(__name__).get_log()

        # GET_KEY_PAGE and REFERER share the same URL.
        login_form_url = "https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index"
        self.GET_KEY_PAGE = login_form_url
        self.LOGIN_PAGE = "https://accounts.pixiv.net/api/login?lang=zh"  # login endpoint
        self.REFERER = login_form_url
        self.USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
        self.headers = {'Referer': self.REFERER, 'User-Agent': self.USER_AGENT}

        # AJAX endpoints, all below the same root.
        ajax_root = "https://www.pixiv.net/ajax"
        # Animated work (ugoira) metadata; illust_id is the work id.
        self.UGOIRA_URL = ajax_root + '/illust/{illust_id}/ugoira_meta'
        # All works of one illustrator.
        self.URL_4_GET_ALL_WORK_ID = ajax_root + "/user/{userId}/profile/all"
        # Illustrator home page: 24 works plus a little illustrator info.
        self.URL_4_ILLUSTER_MAIN_PAGE = ajax_root + "/user/{}/profile/top"
        # Illustrator details.
        self.URL_4_GET_ILLUSTER_INFO = ajax_root + "/user/{}?full=1"
        # Illustration page. The older member_illust.php URL used to be
        # assigned first and was immediately overwritten by this one.
        self.URL_ILLUST_PAGE = ajax_root + "/illust/{illust_id}"

        # HTTP session with optional cookie and proxies.
        self.session = http = requests.Session()
        if self.cookie is not None:
            http.cookies.set("authentication", self.cookie)
        if not (self.proxies['http'] is None and self.proxies['https'] is None):
            http.proxies = self.proxies  # session-wide default proxies

        # Retry failed requests up to max_retries times for both schemes.
        retry_adapter = requests.adapters.HTTPAdapter(
            max_retries=self.max_retries)
        for scheme in ('https://', 'http://'):
            http.mount(scheme, retry_adapter)

        self.spider_util = SpiderUtil()
        # Database access.
        self.db_util = DbUtil()
        self.sql_4_insert = ('insert into illust(title,url,illust_id,illuster_id,page_no,status,`restrict`,x_restrict)'
                             'values ( %s, %s, %s, %s, %s, 0, %s, %s)')
        six_column_insert = 'insert  into illust(title,url,illust_id,illuster_id,page_no,status)'
        self.sql_4_insert_4_ugoira = six_column_insert + 'values( %s, %s, %s, %s, %s,444)'
        self.sql_4_insert_2_done = six_column_insert + 'values( %s, %s, %s, %s, 0, 10)'
Beispiel #6
0
def init_db():
    """Create the configured database, its tables and the initial admin user.

    Steps, in order:

    1. Read the ``db_info`` settings and run ``CREATE DATABASE IF NOT EXISTS``
       over a raw ``MySQLdb`` connection.
    2. Create all tables registered on ``BaseModel.metadata``.
    3. Insert the admin user from the ``admin_account`` settings unless a
       user with that e-mail already exists.

    The raw connection and its cursor are always closed, even when creating
    the database fails.
    """
    conf = ConfigUtil()
    host = conf.get_config(conf='host', section='db_info')
    port = conf.get_config(conf='port', section='db_info')
    username = conf.get_config(conf='username', section='db_info')
    password = conf.get_config(conf='password', section='db_info')
    database = conf.get_config(conf='database', section='db_info')

    # The database name comes from local configuration and is quoted with
    # backticks because identifiers cannot be bound as query parameters.
    sql = 'CREATE DATABASE IF NOT EXISTS `{db}` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;'.format(
        db=database)

    conn = MySQLdb.connect(host=host,
                           user=username,
                           passwd=password,
                           port=int(port))
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()

    session = MysqlUtil().get_session()
    BaseModel.metadata.create_all(MysqlUtil().get_engine())

    email = conf.get_config('email', 'admin_account')
    if session.query(User).filter_by(email=email).first() is not None:
        return

    admin_password = conf.get_config('password', 'admin_account')
    admin_username = conf.get_config('username', 'admin_account')
    # One timestamp so register and last-login times are identical.
    now = datetime.datetime.now()

    user = User(username=admin_username,
                password=generate_password_hash(admin_password),
                email=email,
                status=1,
                is_admin=1,
                register_time=now,
                last_login_time=now)
    session.add(user)
    session.commit()
Beispiel #7
0
 def __init__(self):
     """Obtain a session from ``MysqlUtil`` and keep it on the instance."""
     self.session = MysqlUtil().get_session()
Beispiel #8
0
class PixivUtil:
    def __init__(self):
        """Prepare patterns, configuration, endpoints, the HTTP session and SQL.

        ``init_pattern()`` and ``load_config()`` run first; the session setup
        reads ``self.cookie``, ``self.proxies`` and ``self.max_retries``.
        """
        self.init_pattern()
        self.load_config()
        self.logger = Log(__name__).get_log()

        # GET_KEY_PAGE and REFERER share the same URL.
        login_form_url = "https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index"
        self.GET_KEY_PAGE = login_form_url
        self.LOGIN_PAGE = "https://accounts.pixiv.net/api/login?lang=zh"  # login endpoint
        self.REFERER = login_form_url
        self.USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
        self.headers = {'Referer': self.REFERER, 'User-Agent': self.USER_AGENT}

        # AJAX endpoints, all below the same root.
        ajax_root = "https://www.pixiv.net/ajax"
        # Animated work (ugoira) metadata; illust_id is the work id.
        self.UGOIRA_URL = ajax_root + '/illust/{illust_id}/ugoira_meta'
        # All works of one illustrator.
        self.URL_4_GET_ALL_WORK_ID = ajax_root + "/user/{userId}/profile/all"
        # Illustrator home page: 24 works plus a little illustrator info.
        self.URL_4_ILLUSTER_MAIN_PAGE = ajax_root + "/user/{}/profile/top"
        # Illustrator details.
        self.URL_4_GET_ILLUSTER_INFO = ajax_root + "/user/{}?full=1"
        # Illustration page. The older member_illust.php URL used to be
        # assigned first and was immediately overwritten by this one.
        self.URL_ILLUST_PAGE = ajax_root + "/illust/{illust_id}"

        # HTTP session with optional cookie and proxies.
        self.session = http = requests.Session()
        if self.cookie is not None:
            http.cookies.set("authentication", self.cookie)
        if not (self.proxies['http'] is None and self.proxies['https'] is None):
            http.proxies = self.proxies  # session-wide default proxies

        # Retry failed requests up to max_retries times for both schemes.
        retry_adapter = requests.adapters.HTTPAdapter(
            max_retries=self.max_retries)
        for scheme in ('https://', 'http://'):
            http.mount(scheme, retry_adapter)

        self.spider_util = SpiderUtil()
        # Database access.
        self.db_util = DbUtil()
        self.sql_4_insert = ('insert into illust(title,url,illust_id,illuster_id,page_no,status,`restrict`,x_restrict)'
                             'values ( %s, %s, %s, %s, %s, 0, %s, %s)')
        six_column_insert = 'insert  into illust(title,url,illust_id,illuster_id,page_no,status)'
        self.sql_4_insert_4_ugoira = six_column_insert + 'values( %s, %s, %s, %s, %s,444)'
        self.sql_4_insert_2_done = six_column_insert + 'values( %s, %s, %s, %s, 0, 10)'

    def get_session(self):
        """Log in and hand back the HTTP session.

        :return: ``self.session`` when ``login()`` reports success; otherwise
            an error is logged and the string ``'error'`` is returned.
        """
        if not self.login():
            self.logger.error("登陆失败,请检查账号密码是否正确和网络是否连通")
            return 'error'
        return self.session

    def load_config(self):
        """Read proxy, retry, timeout and account settings from ``config/config.conf``.

        Sets ``self.proxies``, ``self.max_retries``, ``self.timeout``,
        ``self.username``, ``self.password`` and ``self.cookie``.
        """
        config = ConfigureUtil('config/config.conf')
        # NOTE(review): exact meaning of is_error=True is defined by
        # ConfigureUtil.get -- confirm there.
        http_proxy = config.get('proxy', 'http', is_error=True)
        https_proxy = config.get('proxy', 'https', is_error=True)
        self.proxies = {'http': http_proxy, 'https': https_proxy}
        # The retry count must be numeric.
        self.max_retries = config.get('app', 'max_retries', 'int')
        # Timeout, in seconds.
        self.timeout = config.get('app',
                                  'time_out',
                                  type_="int",
                                  is_error=True,
                                  default=20)
        self.username = config.get('account', 'username')
        self.password = config.get('account', 'password')
        # The cookie is read from configuration only. A real session cookie
        # must never be pasted into source, not even as a comment.
        self.cookie = config.get('account', 'cookie', is_error=True)

    def init_pattern(self):
        """Compile the regular expressions used to scrape Pixiv HTML pages.

        All patterns are raw strings so that regex escapes such as ``\\[`` and
        ``\\?`` are not interpreted as (invalid) Python string escapes.
        """
        # Followed-illustrator list: id, profile image and user name.
        self.illuster_id_from_user = re.compile(
            r'<input name="id\[\]" value="(.*?)" type="checkbox"')
        self.illuster_profile_from_user = re.compile(
            r'data-profile_img="(.*?)"')
        self.illuster_username_from_user = re.compile(
            r'"data-user_name="(.*?)"></a>')
        # Pagination links of the "show" and "hide" follow lists.
        self.pagenum_4_show = re.compile(
            r'<a href="\?type=user&amp;rest=show&amp;p=(.*?)">')
        self.pagenum_4_hide = re.compile(
            r'<a href="\?type=user&amp;rest=hide&amp;p=(.*?)">')
        # Used to extract the work details.
        self.get_illust_detail = re.compile(
            r'}\)\((.*?)\);</script><link rel="apple-touch-icon"')

    def set_logger(self, logger):
        """Replace the logger used by this instance with ``logger``."""
        self.logger = logger

    def download_work_by_illust_id(self,
                                   save_dir,
                                   illust_id: str,
                                   use_databse=False):
        """Download one work, dispatching on its illustration type.

        :param save_dir: directory handed to the type-specific downloader.
        :param illust_id: id of the work, as a string.
        :param use_databse: when true, the ``illust`` table is consulted first
            and an already finished work is skipped.
        :return: ``True`` when the database marks the work as done; ``None``
            in every other case (including when the work info is unavailable).
        """
        if use_databse:
            # Parameterized so illust_id is never spliced into the SQL text.
            res = self.db_util.get_one(
                "select status from illust where illust_id = %s", illust_id)
            if res is not None and res[0] == WorkStatus.done.value:
                # Already downloaded.
                self.logger.info(" 本插画已被下载 illust_id 是 " + illust_id)
                return True

        # Fetch the work's metadata.
        illust_info = self.get_img_info_by_img_id(illust_id)
        if illust_info is None:
            return None
        (title, img_url, page_count, restrict, x_restrict, illust_type,
         illuster_id) = illust_info

        # NOTE(review): use_databse is not forwarded to download_illust, so
        # that call keeps its use_database=False default -- confirm intent.
        if illust_type == IllustType.illust.value:
            self.download_illust(save_dir, illust_id, illuster_id, title,
                                 img_url, page_count, restrict, x_restrict)
        elif illust_type == IllustType.ugoira.value:
            self.logger.warning("遇到ugoira插画,id为{}".format(illust_id))
            self.download_ugoira(save_dir, illust_id, illuster_id, title,
                                 img_url, restrict, x_restrict)
        elif illust_type == IllustType.manga.value:
            self.download_manga(illust_id, illuster_id, title, img_url,
                                page_count, restrict, x_restrict)
        else:
            self.logger.info("暂不支持该类型的下载")
        return None

    def download_illust(self,
                        save_dir,
                        illust_id,
                        illuster_id,
                        title,
                        img_url,
                        page_count,
                        restrict,
                        x_restrict,
                        use_database=False):
        """Download every page of an illustration.

        Page URLs are derived from ``img_url`` by replacing ``_p0`` with
        ``_p<i>``. Downloading stops at the first page that fails or raises.

        :param use_database: when true, the outcome is stored via
            ``insert_illust`` with the done or failure status.
        """
        res = True
        # Pre-bound so the error log below can always print a URL, even when
        # building the page URL itself is what failed.
        page_url = img_url
        for i in range(page_count):
            try:
                page_url = img_url.replace('_p0', '_p' + str(i))
                if not self.spider_util.download_img(
                        page_url, save_dir, header=self.headers):
                    res = False
                    break
            except Exception as e:
                res = False
                self.logger.error("下载失败" + repr(e) + "url是" + str(page_url),
                                  exc_info=True)
                break

        if use_database:
            # A failed download is recorded too, with the failure status.
            status = WorkStatus.done.value if res else WorkStatus.failure.value
            self.insert_illust(illust_id, title, img_url, illuster_id,
                               page_count, restrict, x_restrict, status)

    def download_illust_o(self, save_dir, illust_id, illuster_id, title,
                          img_url, page_count, restrict, x_restrict):
        """Record a work, download all of its pages, then mark it done.

        Each page URL is derived from the original ``img_url`` by replacing
        ``_p0`` with ``_p<i>``. The work is first stored through
        ``insert_illust`` with the failure status; the row is switched to the
        done status only after every page downloaded successfully.
        """
        res = True
        page_url = img_url
        for i in range(page_count):
            try:
                # Derive from the untouched img_url: rewriting img_url in
                # place would lose the "_p0" marker after page 1.
                page_url = img_url.replace('_p0', '_p' + str(i))
                # Keyword arguments keep the values aligned with
                # insert_illust's parameters.
                self.insert_illust(illust_id,
                                   title=title,
                                   url=img_url,
                                   illuster_id=illuster_id,
                                   page_num=page_count,
                                   restrict=restrict,
                                   x_restrict=x_restrict,
                                   status=WorkStatus.failure.value)
                res = res and self.spider_util.download_img(
                    page_url, save_dir, header=self.headers)
            except Exception as e:
                res = False
                self.logger.error("下载失败" + repr(e) + "url是" + str(page_url),
                                  exc_info=True)
                break
        if res:
            self.db_util.update(
                'update illust set status = %s where illust_id = %s',
                (WorkStatus.done.value, illust_id))

    def filter_4_downloaded_work(self, illust_id_list):
        """Drop the ids of works the database already marks as downloaded.

        The list is filtered in place and also returned. An empty list is
        returned unchanged without querying the database, because an empty
        ``IN ()`` clause is not valid SQL.

        :param illust_id_list: list of work ids.
        :return: the same list object, without the ids whose status is done.
        """
        if not illust_id_list:
            return illust_id_list

        id_csv = ",".join(str(illust_id) for illust_id in illust_id_list)
        illust_sql = 'select illust_id from illust WHERE status = {} AND illust_id in ({})' \
            .format(WorkStatus.done.value, id_csv)
        rows = self.db_util.get_all(illust_sql) or ()

        # Compare as strings so ids match regardless of the column type.
        downloaded = {str(row[0]) for row in rows}
        illust_id_list[:] = [
            illust_id for illust_id in illust_id_list
            if str(illust_id) not in downloaded
        ]
        return illust_id_list

    def get_postkey(self, url):
        """Fetch ``url`` and extract the ``post_key`` value needed for login.

        :return: the post key, or ``None`` (after logging an error) when the
            page does not contain exactly one match.
        """
        # The key is available without custom headers, but the request has
        # to be a GET, not a POST.
        page = self.session.get(url, timeout=15).text
        matches = re.findall('name="post_key" value="(.*?)"', page)
        if len(matches) == 1:
            postkey = matches[0]
            self.logger.info("your post key is " + postkey)
            return postkey
        self.logger.error("无法获得postKey", exc_info=True)
        return None

    def login(self):
        """Simulate submitting the login form.

        :return: ``self.session`` when the response body starts with
            ``success``; ``False`` when no post key is available, when the
            login is rejected, or when any exception occurs.
        """
        try:
            postkey = self.get_postkey(self.GET_KEY_PAGE)
            if not postkey:
                return False

            form = {
                "pixiv_id": self.username,
                "password": self.password,
                "post_key": postkey,
                "ref": "wwwtop_accounts_index",
                "return_to": "https://www.pixiv.net/",
            }
            # Sending the headers avoids some problems.
            response = self.session.post(self.LOGIN_PAGE,
                                         data=form,
                                         headers=self.headers,
                                         cookies={"cookies": self.cookie})
            # First key inside the response's "body" object.
            outcome = re.findall('"body":{"(.*?)"', response.text)[0]
            if outcome != 'success':
                self.logger.info("Login failed")
                return False

            self.logger.info("Log in successfully.Your username is " +
                             self.username)
            return self.session
        except Exception:
            self.logger.error('连接无响应', exc_info=True)
            return False

    def get_illust_ids(self, illuster_id):
        """Return the ids of an illustrator's illustrations.

        Illustrations include both animated and static works.

        :param illuster_id: illustrator id (str or int).
        :return: list of illustration ids, ``[]`` when the ``illusts`` entry
            is an empty list, or ``None`` when the lookup fails.
        """
        try:
            raw = self.get(
                self.URL_4_GET_ALL_WORK_ID.format(userId=illuster_id)).text
            illusts = str2json(raw)["body"]["illusts"]
            # Otherwise a mapping: {illust_id: illust_info, ...}
            if illusts == []:
                return []
            return list(illusts.keys())
        except Exception as e:
            # str() keeps the handler itself from failing on non-str ids.
            self.logger.error("搜索画师失败!" + repr(e) + "illuster_id是" +
                              str(illuster_id),
                              exc_info=True)
            return None

    def get_manga_ids(self, illuster_id):
        """Return the ids of an illustrator's manga works.

        :param illuster_id: illustrator id.
        :return: list of manga ids; ``[]`` when the ``manga`` entry is empty,
            mirroring the empty-list handling in ``get_illust_ids``.
        """
        raw = self.get(
            self.URL_4_GET_ALL_WORK_ID.format(userId=illuster_id)).text
        manga_ids = str2json(raw)["body"]["manga"]
        if not manga_ids:
            # An empty payload has no .keys() when it is a list.
            return []
        return list(manga_ids.keys())

    def get_img_info_by_img_id(self, illust_id):
        """Fetch the details of an illustration or manga.

        :param illust_id: id of the work (str or int).
        :return: ``(title, url, page_count, restrict, x_restrict,
            illust_type, illuster_id)``, or ``None`` when the response is
            flagged as an error or anything goes wrong.
        """
        try:
            info = self.get(
                self.URL_ILLUST_PAGE.format(illust_id=illust_id)).text
            illust_info_json = str2json(info)
            if self.isError(illust_info_json):
                self.logger.error(
                    "找不到illust信息,illust_id is {},error message is {}".format(
                        illust_id, illust_info_json['message']),
                    exc_info=True)
                return None
            body = illust_info_json['body']
            return (body['title'], body['urls']['original'],
                    body['pageCount'], body['restrict'], body['xRestrict'],
                    body['illustType'], body['userId'])
        except Exception as e:
            # str() keeps the handler itself from failing on non-str ids.
            self.logger.error("获取插画信息失败" + repr(e) + "illust_id是" +
                              str(illust_id),
                              exc_info=True)
            return None

    # def is_illuster_exist(self, illuster_id):
    #     try:
    #         illust_id_json = self.get(self.URL_4_GET_ALL_WORK_ID.format(userId=illuster_id)).text  # 获得所有的图片Id
    #         illust_id_json = str2json(illust_id_json)
    #         if self.isError(illust_id_json):
    #             self.logger.error("画师不存在!illuster_id是{}".format(illuster_id), exc_info=True)
    #             return False
    #         else:
    #             return True
    #     except Exception as e:
    #         self.logger.error("无法判断画师是否存在!illuster_id是{}".format(illuster_id), exc_info=True)
    #         return None

    def get_concerned_illuster_info(self, type_, current_page_num):
        """Scrape one page of the followed-user list.

        :param type_: value for the ``rest`` query parameter.
        :param current_page_num: value for the ``p`` query parameter.
        :return: ``(ids, profiles, usernames)`` -- the ``findall`` results of
            the three illustrator patterns on the page HTML.
        """
        page_url = 'https://www.pixiv.net/bookmark.php?type=user&rest={}&p={}'.format(
            type_, current_page_num)
        html = self.get(page_url).text
        return (
            self.illuster_id_from_user.findall(html),
            self.illuster_profile_from_user.findall(html),
            self.illuster_username_from_user.findall(html),
        )

    def get_show_pagenum(self, html):
        """Count the matches of the "show" pagination-link pattern in ``html``."""
        return len(self.pagenum_4_show.findall(html))

    def get_hide_pagenum(self, html):
        """Count the matches of the "hide" pagination-link pattern in ``html``."""
        return len(self.pagenum_4_hide.findall(html))

    def get_concerned_illusters_pagenum(self, type_: str):
        """Return the page count of the followed-illustrator list.

        The page is fetched through ``self.get`` so the request carries the
        same cookie and timeout as the other requests of this class.

        :param type_: ``'hide'`` or ``'show'``.
        :return: number of pagination links found; ``0`` for any other
            ``type_``.
        """
        listing_url = 'https://www.pixiv.net/bookmark.php?type=user&rest={}'.format(
            type_)
        content = self.get(listing_url).text
        if type_ == 'hide':
            return self.get_hide_pagenum(content)
        if type_ == 'show':
            return self.get_show_pagenum(content)
        return 0

    def get_pagenum_hide(self, html):
        """Count the matches of the "hide" pagination-link pattern in ``html``.

        Same computation as ``get_hide_pagenum``.
        """
        return len(self.pagenum_4_hide.findall(html))

    def get_ugoira_info(self, illust_id):
        """Fetch the frame metadata of an animated work (ugoira).

        The request goes through ``self.get`` so it carries the same cookie
        and timeout as the other requests of this class.

        :param illust_id: id of the animated work.
        :return: ``(frames, page_num, zip_url, delays)`` where ``frames`` maps
            each frame's ``file`` to its ``delay``, ``page_num`` is the number
            of frames, ``zip_url`` is the ``originalSrc`` entry and ``delays``
            lists the delays in frame order.
        """
        gif_info = str2json(
            self.get(self.UGOIRA_URL.format(illust_id=illust_id)).text)
        body = gif_info["body"]
        frame_list = body["frames"]
        delays = [frame["delay"] for frame in frame_list]
        frames = {frame['file']: frame['delay'] for frame in frame_list}
        return frames, len(delays), body["originalSrc"], delays

    def download_ugoira(self, save_dir, illust_id, illuster_id, title, url,
                        restrict, x_restrict):
        """Download an animated work's zip archive and record the outcome.

        The archive is saved as ``<illust_id>.zip`` inside ``save_dir``. The
        work is then stored through ``insert_ugoira`` with the done status
        when the download succeeded and the failure status otherwise. Any
        exception is logged and swallowed.
        """
        try:
            frames, page_num, zip_url, delays = self.get_ugoira_info(illust_id)
            zip_path = os.path.join(save_dir, illust_id + ".zip")
            downloaded = self.spider_util.download_ugoira(
                zip_url, zip_path, self.headers)
            # Frame delays are stored as one comma-separated string.
            delays_csv = ",".join(str(delay) for delay in delays)
            if downloaded:
                status = WorkStatus.done.value
            else:
                status = WorkStatus.failure.value
            self.insert_ugoira(illust_id,
                               title,
                               url,
                               illuster_id,
                               page_num=page_num,
                               restrict=restrict,
                               x_restrict=x_restrict,
                               status=status,
                               delays=delays_csv)
        except Exception:
            self.logger.error('下载动图时,遇到问题', exc_info=True)

    def insert_illust(self,
                      illust_id,
                      title=None,
                      url=None,
                      illuster_id=None,
                      page_num=None,
                      restrict=None,
                      x_restrict=None,
                      status=None):
        """Insert the illustration row, or update it when it already exists.

        The stored ``loc_url`` is ``<illuster_id>/<last path segment of url>``.
        NOTE(review): ``url`` defaults to ``None`` but is split
        unconditionally, so callers must always pass it.
        """
        existing = self.db_util.get_one(
            'SELECT id  FROM illust WHERE illust_id = %s ', illust_id)
        file_name = url.split('/')[-1]
        loc_url = str(illuster_id) + "/" + file_name
        work_type = IllustType.illust.value

        if existing:
            update_sql = 'UPDATE illust SET status = %s, page_no=%s, `restrict`=%s, x_restrict=%s, title=%s, ' \
                         'loc_url=%s, type=%s WHERE illust_id=%s'
            self.db_util.update(update_sql,
                                (status, page_num, restrict, x_restrict, title,
                                 loc_url, work_type, illust_id))
            return

        # No row for this work yet.
        insert_sql = 'INSERT INTO illust(title, url, illust_id, illuster_id, page_no, `type`, status,`restrict`,' \
                     'x_restrict, loc_url)  VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        self.db_util.insert(insert_sql,
                            (title, url, illust_id, illuster_id, page_num,
                             work_type, status, restrict, x_restrict,
                             loc_url))

    def insert_ugoira(self,
                      illust_id,
                      title,
                      url,
                      illuster_id,
                      page_num,
                      restrict=None,
                      x_restrict=None,
                      status=None,
                      delays=None):
        """Insert an animated work, or update its ``illust`` row when present.

        A new work gets one row in ``ugoira`` (id and delays) and one row in
        ``illust``. An existing work has its ``illust`` row updated with the
        given ``status`` and the ugoira type.

        :param delays: frame delays, already joined into one string.
        """
        select_sql = 'SELECT id  FROM illust WHERE illust_id = %s AND type = {}'.format(
            IllustType.ugoira.value)
        res = self.db_util.get_one(select_sql, illust_id)
        loc_url = str(illuster_id) + "/" + url.split('/')[-1]
        if not res:  # nothing stored for this work yet
            insert_sql = "INSERT INTO ugoira( ugoira_id, delays) VALUES (%s, %s)"
            self.db_util.insert(insert_sql, (illust_id, delays))
            insert_sql = 'INSERT INTO illust(title, url, illust_id, illuster_id, page_no, `type`, status,`restrict`,' \
                         'x_restrict, loc_url)  VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
            self.db_util.insert(insert_sql,
                                (title, url, illust_id, illuster_id, page_num,
                                 IllustType.ugoira.value, status, restrict,
                                 x_restrict, loc_url))
        else:
            update_sql = 'UPDATE illust SET status = %s, page_no=%s, `restrict`=%s, x_restrict=%s, title=%s, ' \
                         'loc_url=%s, type=%s WHERE illust_id=%s'
            # Parameter order follows the placeholders: the caller's status
            # first, the ugoira type in the ``type`` slot.
            self.db_util.update(
                update_sql, (status, page_num, restrict, x_restrict, title,
                             loc_url, IllustType.ugoira.value, illust_id))

    def update_illust(self,
                      illust_id,
                      page_num,
                      illust_status=WorkStatus.done.value):
        """Set ``status`` and ``page_no`` of the ``illust`` row for ``illust_id``.

        :param illust_status: status value to store; defaults to the done
            status.
        """
        params = (illust_status, page_num, illust_id)
        self.db_util.update(
            'UPDATE illust SET status = %s , page_no=%s WHERE illust_id = %s',
            params)

    def isError(self, json):
        """Return the value stored under the ``'error'`` key of ``json``."""
        return json['error']

    def download_manga(self, illust_id, illuster_id, title, img_url,
                       page_count, restrict, x_restrict):
        """Placeholder for manga downloads; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def get(self, url):
        """GET ``url`` through the shared session with the cookie and timeout."""
        request_options = {
            'cookies': {"cookies": self.cookie},
            'timeout': self.timeout,
        }
        return self.session.get(url, **request_options)

    def get_illuster_info(self, illuster_id):
        """Fetch an illustrator's name and large profile image URL.

        :return: ``(name, img_url)``, or ``None`` when the response is flagged
            as an error or any exception occurs (both cases are logged).
        """
        try:
            info_url = self.URL_4_GET_ILLUSTER_INFO.format(illuster_id)
            payload = str2json(self.get(info_url).text)
            if self.isError(payload):
                self.logger.error("画师不存在!illuster_id是{}".format(illuster_id),
                                  exc_info=True)
                return None
            body = payload['body']
            return body['name'], body['imageBig']
        except Exception:
            self.logger.error("无法判断画师是否存在!illuster_id是{}".format(illuster_id),
                              exc_info=True)
            return None
Beispiel #9
0
 def __init__(self):
     """Obtain a session from ``MysqlUtil`` and keep it on the instance."""
     mysql_util = MysqlUtil()
     self.session = mysql_util.get_session()
Beispiel #10
0
class AuthorDAO():
    """Query and modify ``Author`` rows through a ``MysqlUtil`` session.

    The mutating methods return a ``(success, message)`` tuple, where
    ``message`` is a user-facing string.
    """

    def __init__(self):
        self.session = MysqlUtil().get_session()

    def get_authors(self):
        """Return all ``Author`` rows."""
        return self.session.query(Author).all()

    def get_author_by_name(self, auth_name):
        """Return the first ``Author`` named ``auth_name``, or ``None``."""
        return self.session.query(Author).filter_by(name=auth_name).first()

    def get_author_by_id(self, auth_id):
        """Return the first ``Author`` whose id is ``auth_id``, or ``None``."""
        return self.session.query(Author).filter_by(id=auth_id).first()

    def add_author(self, auth_name):
        """Insert a new author unless one with the same name exists."""
        if self.get_author_by_name(auth_name) is not None:
            return False, u'作者已经存在!'
        self.session.add(Author(name=auth_name))
        self.session.commit()
        return True, u'添加成功'

    def delete_author(self, auth_name):
        """Delete the author named ``auth_name`` if it exists."""
        auth = self.get_author_by_name(auth_name)
        if auth is None:
            return False, u'作者不存在!'
        self.session.delete(auth)
        self.session.commit()
        return True, u'删除作者成功!'

    def update_author(self, auth_id, auth_name):
        """Rename the author with id ``auth_id`` to ``auth_name``.

        Fails when the new name is already taken or when no author has the
        given id.
        """
        if self.get_author_by_name(auth_name) is not None:
            return False, u'作者名已经存在!'
        auth = self.get_author_by_id(auth_id)
        if auth is None:
            # Previously this fell through to ``auth.name = ...`` and raised
            # AttributeError on None.
            return False, u'作者不存在!'
        auth.name = auth_name
        self.session.commit()
        return True, u'修改成功!'
Beispiel #11
0
def main():
    """Interactively copy every table from one MySQL database to another.

    Steps, in order:

    1. Ask whether to create a fresh dump (``yes``) or reuse an existing
       dump version (``no``); the dump lives in ``./.sql/dump/<version>``.
    2. List the source tables with ``SHOW TABLES``.
    3. For a fresh dump, run ``mysqldump`` once per table.
    4. Print a warning banner with both endpoints and a 5-letter code.
    5. After the operator answers ``yes`` and retypes the code, run
       ``mysql`` once per table to import the dump files.

    :raises Exception: if the requested dump version does not exist, if the
        target host contains ``rds``, or if the verification code is wrong.
    """
    dump_root = './.sql/dump/{0}'

    is_new = input('是否重新导出数据? yes/no:')
    version = time.time()
    if is_new == 'no':
        version = input('请输入历史版本 ps:1601260075.6850908:')
        if not os.path.exists(dump_root.format(version)):
            raise Exception('版本不存在')
    path_dump = dump_root.format(version)
    if not os.path.exists(path_dump):
        os.makedirs(path_dump)

    db_from_config = {'ip': '', 'port': 3306, 'user': '', 'passwd': '', 'db': ''}
    db_to_config = {'ip': '', 'port': 3306, 'user': '', 'passwd': '', 'db': ''}
    # Guard against importing into a host whose address contains "rds".
    if 'rds' in db_to_config['ip']:
        raise Exception('导入数据库存在敏感地址,请再次验证')

    mysql_util = MysqlUtil(config=db_from_config)
    all_tables = [row[0] for row in mysql_util.query_sql('SHOW TABLES')]

    if is_new == 'yes':
        dump_tpl = 'mysqldump --set-gtid-purged=off -h{h} -P{P} -u{u} -p{p} {db} --tables  {tb}>{path_dump}/{tb}.sql'
        for table_name in all_tables:
            print('----> from database <{0}>'.format(db_from_config['ip']))
            print('----> dump table <{0}>'.format(table_name))
            execute_cmd(
                dump_tpl.format(h=db_from_config['ip'],
                                P=db_from_config['port'],
                                u=db_from_config['user'],
                                p=db_from_config['passwd'],
                                db=db_from_config['db'],
                                path_dump=path_dump,
                                tb=table_name))

    # Five distinct lowercase letters the operator has to retype.
    verify_code = ''.join(random.sample('zyxwvutsrqponmlkjihgfedcba', 5))

    banner = (
        '----------WARNING START----------',
        '----------infomation below must be readed carefully!----------',
        '----------or it will make a unexpectable and harmful issue----------',
        '----------FROM DATABASE INFO----------',
        '----------IP {0}----------'.format(db_from_config['ip']),
        '----------DATABASE {0}----------'.format(db_from_config['db']),
        '----------TO DATABASE INFO----------',
        '----------IP {0}----------'.format(db_to_config['ip']),
        '----------DATABASE {0}----------'.format(db_to_config['db']),
        '----------WARNING END----------',
        '----------VERIFY CODE: {0}------'.format(verify_code),
    )
    for banner_line in banner:
        print(banner_line)

    is_confirm_do = input('已阅读警告 yes/no:')
    if is_confirm_do == 'no':
        print('程序退出')
        return
    if is_confirm_do != 'yes':
        # Any other answer ends the run without importing anything.
        return

    check_verify_code = input('输入上述验证码:')
    if check_verify_code != verify_code:
        raise Exception('验证码错误')

    import_tpl = 'mysql -h{h} -P{P} -u{u} -p{p} {db}<{path_dump}/{tb}.sql'
    for table_name in all_tables:
        print('----> to database <{0}>'.format(db_to_config['ip']))
        print('----> import table <{0}>'.format(table_name))
        execute_cmd(
            import_tpl.format(h=db_to_config['ip'],
                              P=db_to_config['port'],
                              u=db_to_config['user'],
                              p=db_to_config['passwd'],
                              db=db_to_config['db'],
                              path_dump=path_dump,
                              tb=table_name))
Beispiel #12
0
# |\   __  \|\  \|\  \|\___   ___\\   __  \
# \ \  \|\  \ \  \\\  \|___ \  \_\ \  \|\  \
#  \ \   __  \ \  \\\  \   \ \  \ \ \  \\\  \
#   \ \  \ \  \ \  \\\  \   \ \  \ \ \  \\\  \
#    \ \__\ \__\ \_______\   \ \__\ \ \_______\
#     \|__|\|__|\|_______|    \|__|  \|_______|

#  ________  ________  ________  ___       __   ___       _______   ________
# |\   ____\|\   __  \|\   __  \|\  \     |\  \|\  \     |\  ___ \ |\   __  \
# \ \  \___|\ \  \|\  \ \  \|\  \ \  \    \ \  \ \  \    \ \   __/|\ \  \|\  \
#  \ \  \    \ \   _  _\ \   __  \ \  \  __\ \  \ \  \    \ \  \_|/_\ \   _  _\
#   \ \  \____\ \  \\  \\ \  \ \  \ \  \|\__\_\  \ \  \____\ \  \_|\ \ \  \\  \|
#    \ \_______\ \__\\ _\\ \__\ \__\ \____________\ \_______\ \_______\ \__\\ _\
#     \|_______|\|__|\|__|\|__|\|__|\|____________|\|_______|\|_______|\|__|\|__|

# Database handle used for the query below.
conn = MysqlUtil()
#tags = conn.get_all('SELECT * FROM `test1.0`.`tag_Asyn`')
# Ids of books in `book_Asyn` for which the LEFT JOIN with `book_detail`
# yields a NULL `book_introduct`.
ids = conn.get_all(
    'SELECT `test1.0`.`book_Asyn`.`book_id` FROM `test1.0`.`book_Asyn` LEFT JOIN `test1.0`.`book_detail` ON `test1.0`.`book_Asyn`.`book_id` = `test1.0`.`book_detail`.`book_id` WHERE `book_introduct` IS NULL;'
)
print(ids)
#cids = conn.get_all(' SELECT `test1.0`.`book_Asyn`.`book_id` FROM `test1.0`.`book_Asyn` LEFT JOIN `commenttable_Asyn` cA on `book_Asyn`.`book_id` = cA.`book_id` WHERE `comment` IS NULL; ')
# Crawler runner built from the project settings.
runner = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl():
    """Loop forever over ``tags`` and print the first field of each entry."""
    # NOTE(review): `tags` is not assigned in the visible code (the only
    # assignment is commented out), so this would raise NameError as written.
    # NOTE(review): no `yield` is visible in this body even though it is
    # decorated with inlineCallbacks -- the snippet looks truncated.
    while True:
        for tag in tags:
            print('*********************')
            print('\t\a' + tag[0])
Beispiel #13
0
#!/usr/bin/env python
# coding=UTF-8
'''
 # Desc:
 # Author:TavisD 
 # Time:2016-10-10 10:53
 # Ver:V1.0
'''

from utils.api_client import APIClient
from utils.file_util import FileUtil
from utils.mysql_util import MysqlUtil
from utils.gen_util import GenUtil

# Utility instances created once, when this module is imported.
api_client = APIClient()
file_util = FileUtil()
mysql_util = MysqlUtil()
gen_util = GenUtil()
Beispiel #14
0
class PixivSpider():
    """Download the works of pixiv illustrators.

    Illustrator ids come from the ``download.illuster_ids`` config entry
    and, optionally, from the ``illuster`` database table.
    """

    def load_config(self):
        """Read ``config/config.conf`` into attributes on this instance.

        Sets ``thread_num``, ``root_path`` and ``waiting_time``.

        :raises Exception: whatever the config lookup raised, after logging.
        """
        self.config = ConfigureUtil('config/config.conf')
        try:
            self.thread_num = self.config.get("app", "thread_num", type_="int")
            # Root directory for downloaded images.
            self.root_path = self.config.get("download", "path")
            # Pause between download batches (passed to time.sleep).
            self.waiting_time = self.config.get("download",
                                                "waiting_time",
                                                type_="int")
        except Exception as e:
            self.logger.error("请检查你配置的下载路径====》{}".format(repr(e)),
                              exc_info=True)
            # Bare raise keeps the original traceback intact.
            raise

    def load_logger(self):
        """Create the logger used by this instance."""
        self.logger = Log(__name__).get_log()

    def __init__(self):
        self.sql_4_update = 'update illuster set priority = 0,  modify_time = "{}" WHERE illuster_id  = {}'
        # The logger must exist before load_config, which logs on failure.
        self.load_logger()
        self.load_config()
        self.pixiv_util = PixivUtil()
        self.db_util = DbUtil()
        self.pool = threadpool.ThreadPool(self.thread_num)
        self.WAITING_SQL = 'select illuster_id from illuster WHERE priority > 0 ' \
                           ' AND illuster_id!=11 ORDER BY priority DESC LIMIT {} '
        # NOTE(review): this turns off HTTPS certificate verification for the
        # whole process and silences the related warnings.
        ssl._create_default_https_context = ssl._create_unverified_context
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

        # Experimental workaround for the
        # ('ssl3_read_bytes', 'sslv3 alert bad record mac') error.
        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'

    def login(self):
        """Log in through ``pixiv_util``; return ``True`` on success."""
        if self.pixiv_util.login():
            return True
        self.logger.info("登陆失败,请检查账号密码是否正确和网络是否连通")
        return False

    def save_concerned_illuster_info(self, type_: str):
        """Store the illustrators followed by the logged-in user.

        Existing rows are refreshed (name, image URL, modify time, priority
        reset to 0); unknown illustrators are inserted with priority 5.

        :param type_: ``hide`` for private follows, ``show`` for public ones.
        :return: ``None``; returns early when the login fails.
        """
        if not self.login():
            return
        page_num = self.pixiv_util.get_concerned_illusters_pagenum(type_)
        for current_page_num in range(1, page_num + 1):
            ids_, profiles, usernames = self.pixiv_util.get_concerned_illuster_info(
                type_, current_page_num)
            for illuster_id, profile, username in zip(ids_, profiles,
                                                      usernames):
                # User names and URLs come from the remote site, so they are
                # passed as query parameters instead of being concatenated
                # into the SQL text (a quote in a name used to break it).
                res = self.db_util.get_one(
                    "select id from illuster where illuster_id = %s",
                    illuster_id)
                str_time = get_time()
                if res:
                    self.logger.debug("该画师已存在" + illuster_id)
                    self.db_util.update(
                        'UPDATE illuster SET name=%s, image_url=%s, modify_time=%s, priority=%s '
                        'WHERE illuster_id =%s',
                        (username, profile, str_time, 0, illuster_id))
                else:
                    self.db_util.insert(
                        "insert into illuster(illuster_id,name,image_url,create_time,modify_time,priority) "
                        "value(%s, %s, %s, %s, %s, 5)",
                        (illuster_id, username, profile, str_time, str_time))
        self.logger.info("信息保存完毕")

    def get_illuster_ids(self, num, use_data_base=False):
        """Return up to ``num`` illustrator ids that still need crawling.

        Ids from the config file come first. When there are fewer than
        ``num`` of them and ``use_data_base`` is true, waiting ids from the
        database are appended, with duplicates removed and order preserved.

        :param num: maximum number of ids to return.
        :param use_data_base: whether to top up from the database.
        :return: list of id strings, at most ``num`` long.
        """
        configured = self.config.get("download", "illuster_ids")
        ids = [] if configured is None else configured.split()
        if len(ids) >= num or not use_data_base:
            return ids[:num]
        rows = self.db_util.get_all(self.WAITING_SQL.format(num))
        ids.extend(str(row[0]) for row in rows)
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # config ids keep precedence and the cut to ``num`` is deterministic.
        return list(dict.fromkeys(ids))[:num]

    def main(self, batch_size=10, use_database=False):
        """Crawl illustrators batch by batch; this loop never returns.

        For every id in a batch: optionally sync the illustrator row with
        the database, fetch the list of works, download them through the
        thread pool, then remove the id from the config file and reset its
        priority in the database.

        :param batch_size: number of illustrators fetched per batch.
        :param use_database: whether the database is read and updated.
        """
        while True:
            illuster_ids = self.get_illuster_ids(batch_size, use_database)
            if len(illuster_ids) == 0:
                time.sleep(60 * 60 * 4)
            for illuster_id in illuster_ids:
                self.logger.info("正在下载的是" + illuster_id + "的作品")
                if use_database:
                    res = self.db_util.get_one(
                        "select id from illuster where illuster_id = %s",
                        illuster_id)
                    str_time = get_time()
                    # Check the illustrator still exists on the site.
                    info = self.pixiv_util.get_illuster_info(illuster_id)
                    if info is None:
                        # NOTE(review): an UPDATE statement is sent through
                        # db_util.insert here; kept as in the original call.
                        self.db_util.insert(
                            "update illuster set priority = -1 WHERE illuster_id = %s",
                            illuster_id)
                        continue
                    name, img_url = info
                    if res:
                        self.logger.debug("该画师已存在数据库中" + illuster_id)
                        self.db_util.update(
                            "update illuster set name = %s, image_url = %s, modify_time = %s where illuster_id = %s",
                            (name, img_url, str_time, illuster_id))
                    else:
                        self.db_util.insert(
                            "insert into illuster(illuster_id, name, image_url, create_time,modify_time,priority)"
                            " value(%s, %s, %s, %s, %s,5)",
                            (illuster_id, name, img_url, str_time, str_time))
                dir_ = os.path.join(self.root_path, illuster_id)
                make_directory(dir_)
                illust_ids = self.pixiv_util.get_illust_ids(illuster_id)
                # The original tested ``illuster_ids`` (the batch list) here,
                # so a failed lookup crashed on ``len(None)`` below.
                if illust_ids is None:
                    self.logger.error(
                        '获得插画列表失败,画师id为{}, 开始下一个画师的信息爬取'.format(illuster_id))
                    continue
                illust_num = len(illust_ids)
                self.logger.info("共有插图%d个" % (illust_num))
                if use_database:
                    # The row exists by now (updated or inserted above), so
                    # only the count is refreshed; inserting again would add
                    # a second row for an illustrator that was new.
                    self.db_util.update(
                        "update illuster set name = %s, image_url = %s, modify_time = %s, illust_num = %s where illuster_id = %s",
                        (name, img_url, str_time, illust_num, illuster_id))
                if illust_num > 0:
                    if use_database:
                        illust_ids = self.pixiv_util.filter_4_downloaded_work(
                            illust_ids)
                    # One (args, kwargs) entry per work for the thread pool.
                    var_list = [([dir_, illust_id], None)
                                for illust_id in illust_ids]
                    if self.pool is None:
                        self.pool = threadpool.ThreadPool(self.thread_num)
                    tasks = threadpool.makeRequests(
                        self.pixiv_util.download_work_by_illust_id, var_list)
                    for task in tasks:
                        self.pool.putRequest(task)
                    self.pool.wait()

                # Drop the first configured id: it is the one just handled.
                il_ids = self.config.get("download", "illuster_ids")
                if il_ids is not None:
                    self.config.update("download", "illuster_ids",
                                       " ".join(il_ids.split()[1:]))
                if use_database:
                    self.db_util.update(
                        self.sql_4_update.format(get_time(), illuster_id))
                self.logger.info("{}的作品下载完成".format(illuster_id))
            self.logger.info('one batch is over')
            time.sleep(self.waiting_time)
Beispiel #15
0
 def __init__(self, path):
     """Create the helper objects and remember ``path`` as ``root_path``.

     :param path: stored unchanged in ``self.root_path``.
     """
     self.db_util = MysqlUtil()
     # NOTE(review): attribute name is spelled "spdier"; left as-is because
     # other code may refer to it by this name.
     self.spdier_util = SpiderUtil()
     self.root_path = path
     self.logger = Log(__name__).get_log()
Beispiel #16
0
class Downloader:
    """Download files whose URLs are read from the database.

    Rows are fetched with a caller-supplied SQL statement, downloaded
    through a thread pool, and the ``illust`` status is updated afterwards.
    """

    def __init__(self, path):
        self.db_util = MysqlUtil()
        self.spdier_util = SpiderUtil()
        self.root_path = path
        self.logger = Log(__name__).get_log()

    def get_info(self, select_sql):
        """Run ``select_sql`` and turn each row into a download task.

        Row columns are read by position: 0 = URL bytes, 1 = illust id,
        2 = illuster id, 3 = type, 4 = page count.

        :return: list of ``(url, save_path, row[4], row[1])`` tuples; rows
            whose URL contains ``.gif`` are left out.
        """
        tasks = []
        for row in self.db_util.get_all(select_sql):
            if row[3] == IllustType.ugoira.value:
                # TODO: only the zip is downloaded; thumbnails come later.
                url = Downloader.handle_url_4_ugoira(row[0])
            else:
                url = str(row[0], encoding='utf-8')
                if '.gif' in url:
                    # gif files are not handled for now.
                    continue
                if not ('.jpg' in url or '.png' in url):
                    self.logger.warning('不支持的类型!illust_id为{}'.format(row[1]))
            save_path = self.get_path(url, str(row[2]))
            tasks.append((url, save_path, row[4], row[1]))
        return tasks

    def get_path(self, url: str, illuster_id: str):
        """Return the local file path for ``url`` under the illuster's folder.

        The folder ``root_path/illuster_id`` is created via
        ``make_directory``; the file name is the last ``/`` segment of
        ``url``.
        """
        folder = os.path.join(self.root_path, illuster_id)
        make_directory(folder)
        file_name = url.split("/")[-1]
        return os.path.join(folder, file_name)

    @staticmethod
    def handle_url_4_ugoira(url):
        """Convert an original-image URL (bytes) into its ugoira zip URL."""
        text = str(url, encoding="utf-8")
        return (text.replace('img-original', 'img-zip-ugoira')
                .replace('0.jpg', '1920x1080.zip')
                .replace('0.png', '1920x1080.zip'))

    def download_file(self, url, save_path, headers):
        """Download one file and validate it.

        :return: ``404`` when the helper reports 404, ``True`` when the
            download succeeded and the saved file is valid, else ``False``.
        """
        status = self.spdier_util.download_file(url, save_path, headers)
        if status == 404:
            self.logger.error('无法找到文件,url为{}'.format(url))
            return 404
        return bool(status and valid_file(save_path, file_extension(save_path)))

    def download_file_list(self, url, save_path, headers, page_num: int,
                           illust_id):
        """Download all pages of a work and update its status.

        The status becomes "complete" only when every page downloaded; a
        404 on any page marks the work "unfound". Any other failure stops
        the remaining downloads and leaves the status untouched.
        """
        status_sql = 'UPDATE illust SET status = {} WHERE illust_id = {}'
        for page in range(page_num):
            page_tag = '_p{}'.format(page)
            outcome = self.download_file(url.replace('_p0', page_tag),
                                         save_path.replace('_p0', page_tag),
                                         headers)
            if outcome == 404:
                self.db_util.update(
                    status_sql.format(WorkStatus.unfound.value, illust_id))
                return
            if not outcome:
                return
        self.db_util.update(
            status_sql.format(WorkStatus.complete.value, illust_id))

    def main(self, select_sql, headers, thread_num, interval):
        """Poll the database forever and download whatever it returns.

        :param select_sql: SQL statement that selects the rows to download.
        :param headers: request headers handed to every download.
        :param thread_num: size of the download thread pool.
        :param interval: pause between two polls (passed to time.sleep).
        :return: never returns.
        """
        pool = None
        while True:
            infos = self.get_info(select_sql)
            print('get info from database')
            if infos:
                param_list = [([url, path, headers, pages, illust_id], None)
                              for url, path, pages, illust_id in infos]
                if pool is None:
                    pool = threadpool.ThreadPool(thread_num)
                for task in threadpool.makeRequests(self.download_file_list,
                                                    param_list):
                    pool.putRequest(task)
                pool.wait()
            self.logger.info('one batch is over')
            print('one batch is over')
            time.sleep(interval)
Beispiel #17
0
class ArticleDao:
    """Query and modify ``Article`` rows through a ``MysqlUtil`` session.

    The paginated getters return ``(rows, info)``. ``info`` is ``u'more'``
    when further pages exist and ``u'nomore'`` otherwise. When the requested
    page lies beyond the data, ``rows`` is ``None`` and ``info`` is an error
    message.
    """

    def __init__(self):
        self.session = MysqlUtil().get_session()

    def _paginate(self, query, page, size):
        """Return one page of ``query``, newest ``modified_time`` first.

        The bounds check and the more/nomore flag use the row count of
        ``query`` itself, so filtered listings report their own size.
        """
        total = query.count()
        if page * size > total:
            return None, u'分页数超出最大值'
        info = u'more' if (page + 1) * size < total else u'nomore'
        rows = query.order_by(desc(Article.modified_time)) \
            .offset(size * page).limit(size).all()
        return rows, info

    def get_articles(self, page=0, size=20):
        """Return page ``page`` (0-based) of all articles."""
        return self._paginate(self.session.query(Article), page, size)

    def get_article_num(self):
        """Return the total number of articles."""
        return self.session.query(Article).count()

    def get_article_by_title(self, title):
        """Return the first article titled ``title``, or ``None``."""
        return self.session.query(Article).filter_by(title=title).first()

    def get_article_by_id(self, id):
        """Return the first article whose id is ``id``, or ``None``."""
        return self.session.query(Article).filter_by(id=id).first()

    def new_article(self, title, author_id, author_name, cate_id, cate_name,
                    intro, filepath, tags):
        """Insert a public article unless the title is already used.

        :return: ``(success, message)``.
        """
        # One timestamp for both columns so a new row starts with
        # create_time == modified_time.
        now = datetime.datetime.now()
        article = Article(title=title,
                          intro=intro,
                          is_public=1,
                          auth_id=author_id,
                          auth_name=author_name,
                          cate_id=cate_id,
                          cate_name=cate_name,
                          file_path=filepath,
                          tags=tags,
                          create_time=now,
                          modified_time=now)

        if self.get_article_by_title(title) is not None:
            return False, u'文章标题已经存在!'
        self.session.add(article)
        self.session.commit()
        return True, u'成功添加新文章!'

    def delete_article(self, article):
        """Delete ``article`` and commit."""
        self.session.delete(article)
        self.session.commit()
        return True, u'删除成功!'

    def get_articles_by_cate(self, cate, page=0, size=20):
        """Return page ``page`` of the articles whose ``cate_id`` is ``cate``."""
        # The original paginated against the count of ALL articles, so
        # 'more' could be reported when the category had no further rows.
        query = self.session.query(Article).filter_by(cate_id=cate)
        return self._paginate(query, page, size)

    def get_article_by_tag(self, tag, page=0, size=20):
        """Return page ``page`` of the articles whose tags contain ``tag``."""
        query = self.session.query(Article) \
            .filter(Article.tags.like(u'%{tag}%'.format(tag=tag)))
        return self._paginate(query, page, size)
Beispiel #18
0
class checker():
    """Compare downloaded files on disk with the download status in the DB.

    Every entry directly under ``root_path`` is treated as an illustrator
    id. Works whose files are missing or invalid are reset to status 1 so
    they get downloaded again; checked illustrators get priority 6.
    """

    def __init__(self, root_path):
        self.root_path = root_path
        self.SELECT_SQL = "SELECT page_no, loc_url,status FROM illust WHERE illust_id={}"
        self.RESET_ILLUSTER_SQL = "UPDATE illust SET status = 1 WHERE illuster_id = {}"
        self.RESET_ILLUST_SQL = "UPDATE illust SET status = 1 WHERE illuster_id = {} AND illust_id NOT IN ({})"
        self.FILTER_SQL = "SELECT illust_id FROM illust WHERE status = 10 AND illust_id in ({})"
        # priority = 6 marks an illustrator as fully checked.
        self.MAKE_ILLUSTER_STATUS_DONE_SQL = "UPDATE illuster SET priority = 6 WHERE illuster_id = {}"
        self.CHECK_IF_DONE = 'SELECT priority FROM illuster WHERE illuster_id={}'
        self.GET_DONE_ILLUSTER = 'SELECT illuster_id FROM illuster WHERE priority=6'
        self.db_util = MysqlUtil()
        self.logger = Log(__name__, log_cate='checker').get_log()
        self.before_illuster_id = None

    def get_done_illuster(self):
        """Return the ids of all illustrators whose priority is 6."""
        return [row[0] for row in self.db_util.get_all(self.GET_DONE_ILLUSTER)]

    def _ignored_names(self, ignore):
        """Return the directory names to skip, normalised to strings.

        Database ids may come back as ints while directory names are always
        strings, so both sources are converted before comparing.
        """
        names = {str(item) for item in (ignore or [])}
        names.update(str(item) for item in self.get_done_illuster())
        return names

    def _mark_previous_done(self):
        """Set priority 6 on the illustrator handled just before this one."""
        if self.before_illuster_id is not None:
            self.db_util.update(
                self.MAKE_ILLUSTER_STATUS_DONE_SQL.format(
                    self.before_illuster_id))

    def _is_done(self, illuster_id):
        """Return True when the DB already has priority 6 for this id.

        An illustrator without a database row counts as not done.
        """
        row = self.db_util.get_one(self.CHECK_IF_DONE.format(illuster_id))
        return bool(row) and row[0] == 6

    @staticmethod
    def _collect_illust_ids(image_files):
        """Derive the distinct illust ids from file names, keeping order."""
        illust_ids = []
        for file_ in image_files:
            if file_.endswith('.zip'):
                illust_id = file_.replace('.zip', '')
            elif file_.endswith('.gif'):
                # TODO: gif files are not handled yet.
                continue
            else:
                illust_id = file_.split("_p")[0]
            if illust_id not in illust_ids:
                illust_ids.append(illust_id)
        return illust_ids

    def _is_illust_complete(self, illust_id):
        """Return True when every expected file of ``illust_id`` is valid."""
        info = self.db_util.get_one(self.SELECT_SQL.format(illust_id))
        if not info:
            # No database row for this work.
            self.logger.info('数据库中没有信息,illust_id为{}'.format(illust_id))
            print(illust_id)
            return False
        (page_no, loc_url, status) = info
        # 10 is the downloaded-successfully status; lower means incomplete.
        if status < WorkStatus.done.value:
            return False
        loc_url = str(loc_url, encoding='utf-8')
        if 'gif' in loc_url:
            # gif is not verified; in theory gif files never get here.
            return True
        if 'ugoira' in loc_url:
            zip_url = loc_url.replace('_ugoira0.jpg', '.zip') \
                .replace('_ugoira0.png', '.zip')
            return bool(valid_file(os.path.join(self.root_path, zip_url)))
        # Only complete jpg/png works are expected past this point; stop at
        # the first invalid page.
        return all(
            valid_file(
                os.path.join(self.root_path,
                             loc_url.replace("_p0", "_p{}".format(i))))
            for i in range(page_no))

    def check_empty_dir(self, ignore=None):
        """Reset the works of every illustrator whose directory is empty.

        :param ignore: optional iterable of illustrator ids to skip, in
            addition to those the database already marks as done.
        :raises Exception: any failure is logged and re-raised.
        """
        try:
            skip = self._ignored_names(ignore)
            for file_name in os.listdir(self.root_path):
                if file_name in skip:
                    continue
                print(file_name)
                # The directory name is the illuster_id.
                self._mark_previous_done()
                illuster_id = file_name
                self.before_illuster_id = illuster_id
                # Skip illustrators that were already processed.
                if self._is_done(illuster_id):
                    continue
                path = os.path.join(self.root_path, file_name)
                if os.path.isdir(path) and not os.listdir(path):
                    print("no images in {}".format(path))
                    self.db_util.update(
                        self.RESET_ILLUSTER_SQL.format(illuster_id))
        except Exception:
            self.logger.error('some problem happen', exc_info=1)
            raise

    def check(self, ignore=None):
        """Verify every illustrator directory and reset incomplete works.

        For each directory the complete works are determined; every other
        work of that illustrator is reset to status 1. When nothing is
        complete, all of the illustrator's works are reset.

        :param ignore: optional iterable of illustrator ids to skip, in
            addition to those the database already marks as done.
        :raises Exception: any failure is logged and re-raised.
        """
        try:
            skip = self._ignored_names(ignore)
            for file_name in os.listdir(self.root_path):
                if file_name in skip:
                    continue
                print(file_name)
                # The directory name is the illuster_id.
                self._mark_previous_done()
                illuster_id = file_name
                self.before_illuster_id = illuster_id
                if self._is_done(illuster_id):
                    continue
                path = os.path.join(self.root_path, file_name)
                if not os.path.isdir(path):
                    continue
                image_files = os.listdir(path)
                if len(image_files) == 0:
                    print("no images in {}".format(path))
                    self.db_util.update(
                        self.RESET_ILLUSTER_SQL.format(illuster_id))
                    continue
                complete_illust_ids = [
                    illust_id
                    for illust_id in self._collect_illust_ids(image_files)
                    if self._is_illust_complete(illust_id)
                ]
                if not complete_illust_ids:
                    # Nothing is complete: reset every work.
                    self.db_util.update(
                        self.RESET_ILLUSTER_SQL.format(illuster_id))
                else:
                    self.db_util.update(
                        self.RESET_ILLUST_SQL.format(
                            illuster_id, ','.join(complete_illust_ids)))
                print(complete_illust_ids)
            # Mark the last processed illustrator as done too.
            self._mark_previous_done()
        except Exception:
            self.logger.error('some problem happen', exc_info=1)
            raise
Beispiel #19
0
class CategoryDAO:
    """Query and modify ``Category`` rows through a ``MysqlUtil`` session.

    The mutating methods return a ``(success, message)`` tuple, where
    ``message`` is a user-facing string.
    """

    def __init__(self):
        self.session = MysqlUtil().get_session()

    def add_category(self, cate_name):
        """Insert a category unless one with the same name exists."""
        if self.get_cate_by_name(cate_name) is not None:
            return False, u'类目已经存在'
        self.session.add(Category(cate_name=cate_name))
        self.session.commit()
        return True, u'成功添加新分类'

    def get_cate_by_name(self, cate_name):
        """Return the first category named ``cate_name``, or ``None``."""
        return self.session.query(Category).filter_by(
            cate_name=cate_name).first()

    def get_cate_by_id(self, cate_id):
        """Return the first category whose id is ``cate_id``, or ``None``."""
        return self.session.query(Category).filter_by(id=cate_id).first()

    def get_categories(self):
        """Return all categories."""
        return self.session.query(Category).all()

    def delete_category(self, cate_id):
        """Delete the category with id ``cate_id`` if it exists."""
        cate = self.get_cate_by_id(cate_id)
        if cate is None:
            return False, u'删除失败,删除的类目不存在!'
        self.session.delete(cate)
        self.session.commit()
        return True, u'删除成功'

    def update_category(self, cate_id, cate_name):
        """Rename the category with id ``cate_id`` to ``cate_name``.

        Fails when the id is unknown or the new name is already in use.
        """
        cate = self.get_cate_by_id(cate_id)
        if cate is None:
            return False, u'指定ID的类目不存在!'
        # The uniqueness check has to look the NAME up; the original passed
        # the name to the id lookup, so duplicates were never detected.
        if self.get_cate_by_name(cate_name) is not None:
            return False, u'修改的类目名称已经存在!'
        cate.cate_name = cate_name
        cate.modify_time = datetime.datetime.utcnow()
        self.session.commit()
        return True, u'修改成功!'
class GenFile():
    """Generate Java entity and mapper source files from a MySQL schema.

    The table list is read from ``information_schema`` when the object is
    constructed; ``run()`` then renders one entity file and one mapper file
    per table from the two ``.tpl`` templates, skipping files that already
    exist.
    """

    # Schema whose tables are turned into Java sources.
    _DB_NAME = 'tacomall'

    # MySQL DATA_TYPE -> Java field type. Types missing here render as an
    # empty Java type, exactly as before.
    # NOTE(review): that yields invalid Java for e.g. bigint/decimal columns
    # -- confirm whether more mappings are wanted.
    _JAVA_TYPES = {
        'int': 'int',
        'varchar': 'String',
        'text': 'String',
        'datetime': 'Date',
    }

    # Columns that are never emitted as entity fields.
    _IGNORE_COLUMNS = (
        'id', 'is_delete', 'create_time', 'update_time', 'delete_time'
    )

    def __init__(self):
        self.path_tpl_entity = PathUtil.root('templates') + '/entity.tpl'
        self.path_tpl_mapper = PathUtil.root('templates') + '/mapper.tpl'
        self.project_java = 'test'
        self.mysql_util = MysqlUtil({
            'ip': 'localhost',
            'user': '******',
            'passwd': '123456',
            'db': self._DB_NAME
        })
        self.all_tables = []
        self._query_all_tables()

    def _query_all_tables(self):
        """Store the names of all tables of the schema in ``all_tables``."""
        sql_all_tables = '''
        SELECT
            table_name
        FROM
            information_schema.`TABLES`
        WHERE
            TABLE_SCHEMA = '{0}';
        '''.format(self._DB_NAME)
        rows = self.mysql_util.query_sql(sql_all_tables)
        self.all_tables = [row[0] for row in rows]

    def _load_tpl(self, tpl_type=1):
        """Return the text of a template file.

        Args:
            tpl_type: 1 selects the entity template, 2 the mapper template.
                Any other value leaves the path empty, so ``open`` raises.
        """
        path = ''
        if tpl_type == 1:
            path = self.path_tpl_entity
        if tpl_type == 2:
            path = self.path_tpl_mapper
        # Read as utf8 to match the encoding used when writing the output,
        # instead of depending on the platform default.
        with open(path, encoding='utf8') as f_tpl:
            return f_tpl.read()

    def _2camel(self, s, capitalize=False):
        """Convert a snake_case name to camelCase.

        Args:
            s: underscore-separated name; it is lower-cased first.
            capitalize: when True the first word is capitalised as well
                (``user_info`` -> ``UserInfo``), otherwise it stays lower
                case (``user_info`` -> ``userInfo``).
        """
        words = s.lower().split('_')
        if capitalize:
            return ''.join(word.capitalize() for word in words)
        return words[0] + ''.join(word.capitalize() for word in words[1:])

    def _query_columns(self, table):
        """Return ``(DATA_TYPE, column_name, column_comment)`` rows of a table.

        The query is restricted to the configured schema so that a table of
        the same name in another schema cannot contribute columns, and it is
        ordered by ORDINAL_POSITION because information_schema gives no
        ordering guarantee otherwise.
        """
        sql_table_structure = '''
        SELECT
            DATA_TYPE,
            column_name,
            column_comment
        FROM
            information_schema.COLUMNS
        WHERE
            table_name = '{0}'
            AND TABLE_SCHEMA = '{1}'
        ORDER BY
            ORDINAL_POSITION;
        '''.format(table, self._DB_NAME)
        return self.mysql_util.query_sql(sql_table_structure)

    def _render_fields(self, rows):
        """Render the Java field declarations for the given column rows.

        Each row is indexed as (data type, column name, comment); columns
        listed in ``_IGNORE_COLUMNS`` are skipped.
        """
        declarations = []
        for row in rows:
            data_type, column, comment = row[0], row[1], row[2]
            if column in self._IGNORE_COLUMNS:
                continue
            declarations.append(
                '   private {0} {1}; // {2}\n\n'.format(
                    self._JAVA_TYPES.get(data_type, ''),
                    self._2camel(column), comment))
        return ''.join(declarations)

    def _write_file(self, path, content):
        """Write ``content`` to ``path`` as utf8, replacing any old content."""
        with open(path, 'w', encoding='utf8') as f_gen:
            f_gen.write(content)

    def _gen_entity(self):
        """Write one ``<Class>.java`` entity file per table.

        Tables whose target file already exists are reported and skipped
        before their columns are queried.
        """
        tpl = self._load_tpl(1)
        for table in self.all_tables:
            # The text before the first underscore is used as the
            # sub-package and directory name.
            domain = table.split('_')[0]
            package = 'store.tacomall.db.entity.{0}'.format(domain)
            class_name = self._2camel(table, capitalize=True)
            filename_gen = PathUtil.root(self.project_java +
                                         '/entity/{0}/'.format(domain)
                                         ) + '{0}.java'.format(class_name)
            if PathUtil.is_exists(filename_gen):
                print('实体类{0}.java文件已存在'.format(class_name))
                continue
            fields_content = self._render_fields(self._query_columns(table))
            new_tpl = tpl.replace('${PACKAGE}', package)
            new_tpl = new_tpl.replace('${CLASS}', class_name)
            new_tpl = new_tpl.replace('${FIELDS}', fields_content)
            self._write_file(filename_gen, new_tpl)

    def _gen_mapper(self):
        """Write one ``<Class>Mapper.java`` file per table.

        Tables whose target file already exists are reported and skipped.
        """
        tpl = self._load_tpl(2)
        for table in self.all_tables:
            domain = table.split('_')[0]
            entity_class = self._2camel(table, capitalize=True)
            package = 'store.tacomall.db.mapper.{0}'.format(domain)
            class_name = entity_class + 'Mapper'
            # NOTE(review): 'mappper' (three p's) looks like a typo for
            # 'mapper'; kept as-is because the directory on disk may carry
            # the same spelling -- confirm before renaming.
            filename_gen = PathUtil.root(self.project_java +
                                         '/mappper/{0}/'.format(domain)
                                         ) + '{0}.java'.format(class_name)
            if PathUtil.is_exists(filename_gen):
                print('mapper类{0}.java文件已存在'.format(class_name))
                continue
            new_tpl = tpl.replace('${PACKAGE}', package)
            new_tpl = new_tpl.replace('${CLASS}', class_name)
            new_tpl = new_tpl.replace('${ENTITY_DOMAIN}', domain)
            new_tpl = new_tpl.replace('${ENTITY_CLASS}', entity_class)
            self._write_file(filename_gen, new_tpl)

    def run(self):
        """Generate all entity files, then all mapper files."""
        self._gen_entity()
        self._gen_mapper()