Example #1
0
    def main(self):
        """Crawl every list page and spawn a download thread per cover image.

        Walks the paginated movie list until a page fails to load or contains
        no movie entries, extracts cover-image (url, title) pairs from each
        page and dispatches a Download thread for each image. Finally logs the
        total run time and image count.
        """
        page_count = 1
        main_thread_count = threading.activeCount()
        # Threading strategy:
        # 1: keep at most self.thread_count download threads running at once
        # 2: download all images of one page concurrently, then wait before
        #    moving on to the next page
        thread_type = 2
        while True:
            # fetch one list page
            page_data = get_one_page_data(page_count)
            if page_data is None:
                log.error("第%s页获取失败" % page_count)
                break

            # all cover image (url, title) pairs on this page
            image_info_list = re.findall('<img src="" data-original="([^"]*)" class="lazy img" title="([^"]*)">', page_data)
            # number of movie entries on this page
            page_data_count = page_data.count('<div class="item pull-left">')

            # no entries left -> everything has been downloaded
            if page_data_count == 0:
                break
            # BUGFIX: movie count and cover-image count were swapped in the
            # format arguments of this log message
            log.step("第%s页,影片数量%s,获取到的封面图片数量%s" % (page_count, page_data_count, len(image_info_list)))

            for small_image_url, title in image_info_list:
                # thread limit reached, wait for a free slot
                while thread_type == 1 and threading.activeCount() >= self.thread_count + main_thread_count:
                    time.sleep(5)

                title = robot.filter_text(str(title)).upper()
                image_url = get_large_image_url(small_image_url)
                if image_url is None:
                    log.trace("%s的封面图片大图地址获取失败" % title)
                    continue

                log.step("开始下载%s的封面图片 %s" % (title, image_url))

                file_type = image_url.split(".")[-1]
                file_path = os.path.join(self.image_download_path, "%s.%s" % (title, file_type))
                file_temp_path = os.path.join(self.image_download_path, "%s_temp.%s" % (title, file_type))

                # start the download thread
                thread = Download(self.thread_lock, title, file_path, file_temp_path, image_url)
                thread.start()
                time.sleep(0.1)

            # wait until all download threads of this page have finished
            while thread_type == 2 and threading.activeCount() > main_thread_count:
                time.sleep(5)

            page_count += 1

        log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), TOTAL_IMAGE_COUNT))
Example #2
0
    def main(self):
        """Crawl meituzz albums sequentially by album id, downloading images and videos.

        Resumes from the album id stored in the save file, walks album ids one
        by one until a fatal error or too many consecutive deleted albums, then
        writes the next album id back to the save file for the next run.
        """
        # parse the save file to get the album id reached by the last run
        album_id = 1
        if os.path.exists(self.save_data_path):
            save_file = open(self.save_data_path, "r")
            save_info = save_file.read()
            save_file.close()
            album_id = int(save_info.strip())

        total_image_count = 0
        total_video_count = 0
        error_count = 0  # consecutive deleted-album counter
        is_over = False
        while not is_over:
            album_url = "http://meituzz.com/album/browse?albumID=%s" % album_id
            try:
                album_page_return_code, album_page = tool.http_request(album_url)[:2]
            except SystemExit:
                log.step("提前退出")
                break

            if album_page_return_code == -500:
                # server-side error for this particular album, skip it
                log.error("第%s页相册内部错误" % album_id)
                album_id += 1
                continue
            elif album_page_return_code != 1:
                log.error("第%s页图片获取失败" % album_id)
                break

            if album_page.find("<title>相册已被删除</title>") >= 0:
                error_count += 1
                if error_count >= ERROR_PAGE_COUNT_CHECK:
                    # too many deleted albums in a row: assume the end was
                    # reached, rewind album_id to the first deleted one and stop
                    log.error("连续%s页相册没有图片,退出程序" % ERROR_PAGE_COUNT_CHECK)
                    album_id -= error_count - 1
                    break
                else:
                    log.error("第%s页相册已被删除" % album_id)
                    album_id += 1
                    continue
            # reset the consecutive error counter
            error_count = 0

            # image download
            if self.is_download_image and album_page.find('<input type="hidden" id="imageList"') >= 0:
                total_photo_count = tool.find_sub_string(album_page, '<input type="hidden" id="totalPageNum" value=', ' />')
                if not total_photo_count:
                    log.error("第%s页图片数量解析失败" % album_id)
                    break
                total_photo_count = int(total_photo_count)

                # full list of image urls on the album page
                image_url_list = get_image_url_list(album_page)
                if image_url_list is None:
                    log.error("第%s页图片地址列表解析失败" % album_id)
                    break

                if len(image_url_list) == 0:
                    log.error("第%s页没有获取到图片" % album_id)
                    break

                is_fee = False
                if len(image_url_list) != total_photo_count:
                    # a paid album hides at most one image from non-payers;
                    # tolerate that mismatch instead of treating it as an error
                    album_reward_find = re.findall('<input type="hidden" id="rewardAmount" value="(\d*)">', album_page)
                    if len(album_reward_find) == 1:
                        album_reward = int(album_reward_find[0])
                        if album_reward > 0 and total_photo_count - len(image_url_list) <= 1:
                            is_fee = True
                    if not is_fee:
                        log.error("第%s页解析获取的图片数量不符" % album_id)
                        # break

                image_path = os.path.join(self.image_download_path, "%04d" % album_id)
                if not tool.make_dir(image_path, 0):
                    log.error("创建图片下载目录 %s 失败" % image_path)
                    break

                image_count = 1
                for image_url in image_url_list:
                    # strip the "@..." suffix that applies a blur effect
                    image_url = str(image_url).split("@")[0]
                    log.step("开始下载第%s页第%s张图片 %s" % (album_id, image_count, image_url))

                    image_file_path = os.path.join(image_path, "%04d.jpg" % image_count)
                    try:
                        if tool.save_net_file(image_url, image_file_path, True):
                            log.step("第%s页第%s张图片下载成功" % (album_id, image_count))
                            image_count += 1
                        else:
                            log.error("第%s页第%s张图片 %s 下载失败" % (album_id, image_count, image_url))
                    except SystemExit:
                        # early exit requested: drop the partially downloaded album
                        log.step("提前退出")
                        tool.remove_dir(image_path)
                        is_over = True
                        break

                total_image_count += image_count - 1

            # video download
            # NOTE(review): this branch is gated on self.is_download_image;
            # it looks like it should use a video-specific flag
            # (e.g. self.is_download_video) — confirm against the class definition
            if self.is_download_image and album_page.find('<input type="hidden" id="VideoUrl"') >= 0:
                # resolve the video download url
                video_url = get_video_url(album_page)
                log.step("开始下载第%s页视频 %s" % (album_id, video_url))

                video_title = robot.filter_text(tool.find_sub_string(album_page, "<title>", "</title>"))
                file_type = video_url.split(".")[-1]
                video_file_path = os.path.join(self.video_download_path, "%s %s.%s" % (album_id, video_title, file_type))
                try:
                    if tool.save_net_file(video_url, video_file_path, True):
                        log.step("第%s页视频下载成功" % album_id)
                        total_video_count += 1
                    else:
                        log.error("第%s页视频 %s 下载失败" % (album_id, video_url))
                except SystemExit:
                    log.step("提前退出")
                    is_over = True

            if not is_over:
                album_id += 1

        # rewrite the save file with the album id to resume from next run
        save_data_dir = os.path.dirname(self.save_data_path)
        if not os.path.exists(save_data_dir):
            tool.make_dir(save_data_dir, 0)
        save_file = open(self.save_data_path, "w")
        save_file.write(str(album_id))
        save_file.close()

        log.step("全部下载完毕,耗时%s秒,共计图片%s张,视频%s个" % (self.get_run_time(), total_image_count, total_video_count))
Example #3
0
    def run(self):
        """Worker thread: download all new albums of one account.

        self.account_info = [account_name, last_downloaded_post_id, ...].
        Walks the album list (paged by publish time) until a post id at or
        below the stored one is reached, then records the newest post id as
        the new archive mark and updates the shared totals under the lock.
        """
        global TOTAL_IMAGE_COUNT

        account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # a purely numeric account name already is the site id
            if account_name.isdigit():
                site_id = account_name
            else:
                site_id = get_site_id(account_name)
            if site_id is None:
                log.error(account_name + " 主页无法访问")
                tool.process_exit()

            if not site_id:
                log.error(account_name + " site id解析失败")
                tool.process_exit()

            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)

            this_account_total_image_count = 0
            post_count = 0
            first_post_id = "0"
            # initial cursor timestamp for the first page request
            post_time = "2016-11-16 14:12:00"
            is_over = False
            while not is_over:
                # fetch one page of album info, paged by publish time
                post_info_list = get_one_page_post_info_list(site_id, post_time)
                if post_info_list is None:
                    log.error(account_name + " 相册信息列表无法访问")
                    tool.process_exit()

                # an empty page means everything has been fetched
                if len(post_info_list) == 0:
                    break

                for post_info in post_info_list:
                    if not robot.check_sub_key(("title", "post_id", "published_at", "images"), post_info):
                        log.error(account_name + " 相册信息解析失败:%s" % post_info)
                        continue

                    post_id = str(post_info["post_id"])

                    # reached a post already downloaded in a previous run
                    if int(post_id) <= int(self.account_info[1]):
                        is_over = True
                        break

                    # remember the newest post id as the new archive mark
                    if first_post_id == "0":
                        first_post_id = post_id

                    # strip characters not allowed in file names from the title
                    title = robot.filter_text(post_info["title"])
                    if title:
                        post_path = os.path.join(image_path, "%s %s" % (post_id, title))
                    else:
                        post_path = os.path.join(image_path, post_id)
                    if not tool.make_dir(post_path, 0):
                        # directory creation failed, retry without the title;
                        # give up if that fails too
                        log.error(account_name + " 创建相册目录 %s 失败,尝试不使用title" % post_path)
                        post_path = os.path.join(image_path, post_id)
                        if not tool.make_dir(post_path, 0):
                            log.error(account_name + " 创建相册目录 %s 失败" % post_path)
                            tool.process_exit()

                    image_count = 0
                    for image_info in post_info["images"]:
                        image_count += 1
                        if not robot.check_sub_key(("img_id",), image_info):
                            log.error(account_name + " 相册%s 第%s张图片解析失败" % (post_id, image_count))
                            continue
                        image_url = generate_large_image_url(site_id, image_info["img_id"])
                        log.step(account_name + " 相册%s 开始下载第%s张图片 %s" % (post_id, image_count, image_url))

                        file_path = os.path.join(post_path, "%s.jpg" % image_count)
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_name + " 相册%s 第%s张图片下载成功" % (post_id, image_count))
                        else:
                            log.error(account_name + " 相册%s 第%s张图片 %s 下载失败" % (post_info["post_id"], image_count, image_url))
                    # NOTE(review): image_count is incremented regardless of
                    # download success, so failed downloads are counted here
                    this_account_total_image_count += image_count

                    if not is_over:
                        # stop once the configured page limit is reached
                        if 0 < GET_PAGE_COUNT < post_count:
                            is_over = True
                        else:
                            # publish time of this album becomes the cursor for
                            # the next page request
                            post_time = post_info["published_at"]
                            post_count += 1

            log.step(account_name + " 下载完毕,总共获得%s张图片" % this_account_total_image_count)

            # new archive record
            if first_post_id != "0":
                self.account_info[1] = first_post_id

            # persist the updated account info and publish totals under the lock
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += this_account_total_image_count
            ACCOUNTS.remove(account_name)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            # exit code 0 means a requested early exit, anything else is abnormal
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
Example #4
0
    def run(self):
        global TOTAL_IMAGE_COUNT

        coser_id = self.account_info[0]
        if len(self.account_info) >= 3:
            cn = self.account_info[2]
        else:
            cn = self.account_info[0]

        try:
            log.step(cn + " 开始")

            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, cn)

            # 图片下载
            this_cn_total_image_count = 0
            page_count = 1
            total_rp_count = 1
            first_rp_id = ""
            unique_list = []
            is_over = False
            need_make_download_dir = True  # 是否需要创建cn目录
            while not is_over:
                # 获取一页的作品信息
                post_page = get_one_page_post(coser_id, page_count)
                if post_page is None:
                    log.error(cn + " 无法访问第%s页作品" % page_count)
                    tool.process_exit()

                # 解析作品信息,获取所有的正片信息
                cp_id, rp_list = get_rp_list(post_page)
                if cp_id is None:
                    log.error(cn + " 第%s页作品解析异常" % page_count)
                    tool.process_exit()

                for rp_id, title in rp_list.iteritems():
                    # 检查是否已下载到前一次的图片
                    if int(rp_id) <= int(self.account_info[1]):
                        is_over = True
                        break

                    # 新增正片导致的重复判断
                    if rp_id in unique_list:
                        continue
                    else:
                        unique_list.append(rp_id)
                    # 将第一个作品的id做为新的存档记录
                    if first_rp_id == "":
                        first_rp_id = rp_id

                    log.trace("rp: " + rp_id)

                    if need_make_download_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(cn + " 创建CN目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_download_dir = False

                    # 过滤标题中不支持的字符
                    title = robot.filter_text(title)
                    if title:
                        rp_path = os.path.join(image_path, "%s %s" % (rp_id, title))
                    else:
                        rp_path = os.path.join(image_path, rp_id)
                    if not tool.make_dir(rp_path, 0):
                        # 目录出错,把title去掉后再试一次,如果还不行退出
                        log.error(cn + " 创建作品目录 %s 失败,尝试不使用title" % rp_path)
                        rp_path = os.path.join(image_path, rp_id)
                        if not tool.make_dir(rp_path, 0):
                            log.error(cn + " 创建作品目录 %s 失败" % rp_path)
                            tool.process_exit()

                    # 获取正片页面内的所有图片地址列表
                    image_url_list = get_image_url_list(cp_id, rp_id)
                    if image_url_list is None:
                        log.error(cn + " 无法访问正片:%s,cp_id:%s" % (rp_id, cp_id))
                        continue

                    if len(image_url_list) == 0 and IS_AUTO_FOLLOW:
                        log.step(cn + " 检测到可能有私密作品且账号不是ta的粉丝,自动关注")
                        if follow(coser_id):
                            # 重新获取下正片页面内的所有图片地址列表
                            image_url_list = get_image_url_list(cp_id, rp_id)

                    if len(image_url_list) == 0:
                        log.error(cn + " 正片:%s没有任何图片,可能是你使用的账号没有关注ta,所以无法访问只对粉丝开放的私密作品,cp_id:%s" % (rp_id, cp_id))
                        continue

                    image_count = 1
                    for image_url in list(image_url_list):
                        # 禁用指定分辨率
                        image_url = "/".join(image_url.split("/")[0:-1])
                        log.step(cn + " %s 开始下载第%s张图片 %s" % (rp_id, image_count, image_url))

                        if image_url.rfind("/") < image_url.rfind("."):
                            file_type = image_url.split(".")[-1]
                        else:
                            file_type = "jpg"
                        file_path = os.path.join(rp_path, "%03d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            image_count += 1
                            log.step(cn + " %s 第%s张图片下载成功" % (rp_id, image_count))
                        else:
                            log.error(cn + " %s 第%s张图片 %s 下载失败" % (rp_id, image_count, image_url))

                    this_cn_total_image_count += image_count - 1

                    if 0 < GET_PAGE_COUNT < total_rp_count:
                        is_over = True
                        break
                    else:
                        total_rp_count += 1

                if not is_over:
                    if page_count >= get_max_page_count(coser_id, post_page):
                        is_over = True
                    else:
                        page_count += 1

            log.step(cn + " 下载完毕,总共获得%s张图片" % this_cn_total_image_count)

            # 新的存档记录
            if first_rp_id != "":
                self.account_info[1] = first_rp_id

            # 保存最后的信息
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += this_cn_total_image_count
            ACCOUNTS.remove(coser_id)
            self.thread_lock.release()

            log.step(cn + " 完成")
        except SystemExit:
            log.error(cn + " 异常退出")
        except Exception, e:
            log.error(cn + " 未知异常")
            log.error(str(e) + "\n" + str(traceback.format_exc()))
Example #5
0
    def run(self):
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 3 and self.account_info[2]:
            account_name = self.account_info[2]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录
            if IS_SORT:
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            # 获取视频信息列表
            video_info_list = get_video_info_list(account_id)
            if video_info_list is None:
                log.error(account_name + " 视频列表获取失败")
                tool.process_exit()

            video_count = 1
            first_video_id = "0"
            need_make_video_dir = True
            for video_info in video_info_list:
                if not robot.check_sub_key(("item_data",), video_info) or \
                        not robot.check_sub_key(("watch_id", "title"), video_info["item_data"]):
                    log.error(account_name + " 视频信息%s解析失败" % video_info)
                    tool.process_exit()

                # sm30043563
                video_id = str(video_info["item_data"]["watch_id"])

                # 过滤标题中不支持的字符
                video_title = robot.filter_text(video_info["item_data"]["title"])

                # 第一个视频,创建目录
                if need_make_video_dir:
                    if not tool.make_dir(video_path, 0):
                        log.error(account_name + " 创建图片下载目录 %s 失败" % video_path)
                        tool.process_exit()
                    need_make_video_dir = False

                # 获取视频下载地址
                video_url = get_video_url(video_id)
                log.step(account_name + " 开始下载第%s个视频 %s %s" % (video_count, video_id, video_url))
                print video_title
                print "%s %s" % (video_id, video_title)
                file_path = os.path.join(video_path, "%s %s.mp4" % (video_id, video_title))
                if tool.save_net_file(video_url, file_path):
                    log.step(account_name + " 第%s个视频下载成功" % video_count)
                    video_count += 1
                else:
                    log.error(account_name + " 第%s个视频 %s %s 下载失败" % (video_count, video_id, video_url))

            log.step(account_name + " 下载完毕,总共获得%s个视频" % (video_count - 1))

            # 排序
            if IS_SORT:
                if first_video_id != "0":
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4):
                        log.step(account_name + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # 新的存档记录
            if first_video_id != "0":
                self.account_info[1] = first_video_id

            # 保存最后的信息
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
Example #6
0
    def run(self):
        """Worker thread: download images from all new weibo articles of one account.

        self.account_info = [account_id, last_article_publish_time, account_name(optional)].
        Walks the article preview pages until an article at or below the stored
        publish time is seen, then records the newest publish time as the new
        archive mark and updates the shared totals under the lock.
        """
        global TOTAL_IMAGE_COUNT

        account_id = self.account_info[0]
        # display name: third column if present, otherwise the account id
        if len(self.account_info) >= 3 and self.account_info[2]:
            account_name = self.account_info[2]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # resolve the page_id of the account's weibo home page
            account_page_id = get_account_page_id(account_id)
            if account_page_id is None:
                log.error(account_name + " 微博主页没有获取到page_id")
                tool.process_exit()

            page_count = 1
            this_account_total_image_count = 0
            first_article_time = "0"
            is_over = False
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
            while not is_over:
                # fetch one page of article previews
                preview_article_page = get_one_page_preview_article_data(account_page_id, page_count)

                if preview_article_page is None:
                    log.error(account_name + " 第%s页文章获取失败" % page_count)
                    tool.process_exit()

                # split the preview page into per-article chunks
                preview_article_data_list = get_preview_article_data_list(preview_article_page)
                if len(preview_article_data_list) == 0:
                    log.error(account_name + " 第%s页文章解析失败,页面:%s" % (page_count, preview_article_page))
                    tool.process_exit()

                for preview_article_data in preview_article_data_list:
                    # publish time of the article
                    article_time = get_article_time(preview_article_data)
                    if article_time is None:
                        log.error(account_name + " 预览 %s 中的文章发布时间解析失败" % preview_article_data)
                        continue

                    # reached an article already handled in a previous run
                    if article_time <= int(self.account_info[1]):
                        is_over = True
                        break

                    # remember the newest publish time as the new archive mark
                    if first_article_time == "0":
                        first_article_time = str(article_time)

                    # article url
                    article_url = get_article_url(preview_article_data)
                    if article_url is None:
                        log.error(account_name + " 预览 %s 中的文章地址解析失败" % preview_article_data)
                        continue

                    # article id
                    article_id = get_article_id(article_url)
                    if article_id is None:
                        log.error(account_name + " 文章地址 %s 解析文章id失败" % article_url)
                        continue

                    # article page content (following redirects)
                    article_page = auto_redirect_visit(article_url)
                    if not article_page:
                        log.error(account_name + " 文章 %s 获取失败" % article_url)
                        continue

                    # article title
                    title = get_article_title(article_page, article_id[0])
                    # strip characters not allowed in file names from the title
                    title = robot.filter_text(title)
                    if title:
                        article_path = os.path.join(image_path, "%s %s" % (article_id, title))
                    else:
                        article_path = os.path.join(image_path, article_id)
                    if not tool.make_dir(article_path, 0):
                        # directory creation failed, retry without the title;
                        # give up if that fails too
                        log.error(account_name + " 创建文章目录 %s 失败,尝试不使用title" % article_path)
                        article_path = os.path.join(image_path, article_id)
                        if not tool.make_dir(article_path, 0):
                            log.error(account_name + " 创建文章目录 %s 失败" % article_path)
                            tool.process_exit()

                    # header image at the top of the article, saved as 0000.*
                    top_picture_url = get_article_top_picture_url(article_page)
                    if top_picture_url:
                        log.step(account_name + " %s 开始下载顶部图片 %s" % (title, top_picture_url))

                        file_type = top_picture_url.split(".")[-1]
                        file_path = os.path.join(article_path, "0000.%s" % file_type)
                        if tool.save_net_file(top_picture_url, file_path):
                            log.step(account_name + " %s 顶部图片下载成功" % title)
                            this_account_total_image_count += 1
                        else:
                            log.error(account_name + " %s 顶部图片 %s 下载失败" % (title, top_picture_url))

                    # all image urls inside the article body
                    image_url_list = get_article_image_url_list(article_page, article_id[0])
                    if image_url_list is None:
                        log.error(account_name + " 文章 %s 正文解析失败" % article_url)
                        continue

                    image_count = 1
                    for image_url in list(image_url_list):
                        # skip weibo's own site assets
                        if image_url.find("/p/e_weibo_com") >= 0 or image_url.find("e.weibo.com") >= 0:
                            continue
                        log.step(account_name + " %s 开始下载第%s张图片 %s" % (title, image_count, image_url))

                        file_type = image_url.split(".")[-1]
                        file_path = os.path.join(article_path, "%s.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_name + " %s 第%s张图片下载成功" % (title, image_count))
                            image_count += 1
                        else:
                            log.error(account_name + " %s 第%s张图片 %s 下载失败" % (title, image_count, image_url))

                    if image_count > 1:
                        this_account_total_image_count += image_count - 1

                if not is_over:
                    # stop once the last preview page is reached
                    if page_count >= get_max_page_count(preview_article_page):
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_name + " 下载完毕,总共获得%s张图片" % this_account_total_image_count)

            # new archive record
            if first_article_time != "0":
                self.account_info[1] = first_article_time

            # persist the updated account info and publish totals under the lock
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += this_account_total_image_count
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            # exit code 0 means a requested early exit, anything else is abnormal
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
Example #7
0
    def run(self):
        """Worker thread: download all new songs (originals and covers) of one account.

        self.account_info = [account_id, last_yc_audio_id, last_fc_audio_id, account_name(optional)].
        For each audio type, walks the paginated song list until a song at or
        below the stored id is seen, then records the newest id as the new
        archive mark for that type and updates the shared totals under the lock.
        """
        global TOTAL_VIDEO_COUNT
        global GET_PAGE_COUNT

        account_id = self.account_info[0]
        # display name: fourth column if present, otherwise the account id
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]

        # audio type -> index of its archive column in account_info
        # ("yc" = originals, "fc" = covers)
        audio_type_to_index = {"yc": 1, "fc": 2}
        try:
            log.step(account_name + " 开始")

            video_count = 1
            for audio_type in audio_type_to_index.keys():
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name, audio_type)

                page_count = 1
                first_audio_id = "0"
                unique_list = []  # audio ids seen so far (new songs can shift pagination)
                is_over = False
                need_make_download_dir = True
                while not is_over:
                    # fetch one page of song info for this audio type
                    audio_list = get_one_page_audio_list(account_id, audio_type, page_count)

                    if audio_list is None:
                        # page fetch failed: reset so the old archive record is kept
                        log.step(account_name + " 第%s页%s歌曲页面获取失败" % (page_count, audio_type))
                        first_audio_id = "0"
                        break  # archive recovery

                    # an empty page means everything has been fetched
                    if len(audio_list) == 0:
                        break

                    for audio_info in list(audio_list):
                        audio_id = audio_info[0]
                        # strip characters not allowed in file names from the title
                        audio_title = robot.filter_text(audio_info[1])

                        # reached a song already downloaded in a previous run
                        if int(audio_id) <= int(self.account_info[audio_type_to_index[audio_type]]):
                            is_over = True
                            break

                        # skip duplicates caused by new songs shifting page content
                        if audio_id in unique_list:
                            continue
                        else:
                            unique_list.append(audio_id)
                        # remember the newest song id as the new archive mark
                        if first_audio_id == "0":
                            first_audio_id = str(audio_id)

                        # resolve the song download url
                        audio_url = get_audio_url(audio_id, audio_type_to_index[audio_type])
                        if audio_url is None:
                            log.step(account_name + " %s歌曲ID %s,下载地址获取失败" % (audio_type, audio_id))
                            continue
                        if not audio_url:
                            log.step(account_name + " %s歌曲ID %s,暂不提供下载地址" % (audio_type, audio_id))
                            continue

                        log.step(account_name + " 开始下载第%s首歌曲 %s" % (video_count, audio_url))

                        # first song: create the download directory
                        if need_make_download_dir:
                            if not tool.make_dir(video_path, 0):
                                log.error(account_name + " 创建歌曲下载目录 %s 失败" % video_path)
                                tool.process_exit()
                            need_make_download_dir = False

                        file_path = os.path.join(video_path, "%s - %s.mp3" % (audio_id, audio_title))
                        if tool.save_net_file(audio_url, file_path):
                            log.step(account_name + " 第%s首歌曲下载成功" % video_count)
                            video_count += 1
                        else:
                            log.error(account_name + " 第%s首歌曲 %s 下载失败" % (video_count, audio_url))

                        # stop once the configured download count is reached
                        if 0 < GET_VIDEO_COUNT < video_count:
                            is_over = True
                            break

                    if not is_over:
                        # stop once the configured page limit is reached
                        if 0 < GET_PAGE_COUNT <= page_count:
                            is_over = True
                        # fewer songs than one full page means this was the last page
                        # (an exact multiple of the page size is detected by the
                        # next, empty page instead)
                        elif len(audio_list) < 20:
                            is_over = True
                        else:
                            page_count += 1

                # new archive record for this audio type
                if first_audio_id != "0":
                    self.account_info[audio_type_to_index[audio_type]] = first_audio_id

            log.step(account_name + " 下载完毕,总共获得%s首歌曲" % (video_count - 1))

            # persist the updated account info and publish totals under the lock
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            # exit code 0 means a requested early exit, anything else is abnormal
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
Example #8
0
    def main(self):
        """Crawl album pages sequentially and download every image of each album.

        Resumes from the page number stored in ``self.save_data_path`` (if the
        file exists), walks forward one page at a time until the crawl ends or
        the user aborts (SystemExit inside the download loop), then writes the
        next page number back to the save file so the next run can resume.
        """
        # Parse the save file to get the album/page id reached on the last run
        page_count = 1
        if os.path.exists(self.save_data_path):
            with open(self.save_data_path, "r") as save_file:
                save_info = save_file.read()
            page_count = int(save_info.strip())

        total_image_count = 0
        error_count = 0  # consecutive deleted-album pages seen so far
        is_over = False
        while not is_over:
            album_status, album_data = get_one_page_album_data(page_count)

            if album_status == -1:
                log.error("第%s页相册获取失败" % page_count)
                break
            elif album_status == -2:
                log.error("第%s页相册解析失败" % page_count)
                break
            elif album_status == 2:
                # Deleted album: tolerate a few in a row before assuming the
                # crawl has reached the end of available pages
                error_count += 1
                if error_count >= ERROR_PAGE_COUNT_CHECK:
                    log.error("连续%s页相册没有图片,退出程序" % ERROR_PAGE_COUNT_CHECK)
                    # Rewind so the save file points at the first deleted page
                    page_count -= error_count - 1
                    break
                else:
                    log.error("第%s页相册已被删除" % page_count)
                    page_count += 1
                    continue
            elif album_status == 3:
                # Audio album: nothing to download here, move to the next page
                log.error("第%s页歌曲相册" % page_count)
                page_count += 1
                continue
            elif album_status == 4:
                log.error("第%s页相册未知相册类型%s" % (page_count, album_data))
                break
            # A valid album resets the consecutive-error counter
            error_count = 0

            # Download directory name: "NNNN title", or just "NNNN" if the
            # album has no (usable) title
            title = ""
            if album_data["title"]:
                # Strip characters the filesystem does not support
                title = robot.filter_text(str(album_data["title"].encode("utf-8")))
            if title:
                image_path = os.path.join(self.image_download_path, "%04d %s" % (page_count, title))
            else:
                image_path = os.path.join(self.image_download_path, "%04d" % page_count)
            if not tool.make_dir(image_path, 0):
                # Directory creation failed (probably a bad title); retry with
                # a title-less directory name, and give up if that fails too.
                # BUGFIX: the original joined the int page_count onto the
                # already-failed path and never switched image_path over.
                log.error("第%s页创建相册目录 %s 失败,尝试不使用title" % (page_count, image_path))
                post_path = os.path.join(self.image_download_path, "%04d" % page_count)
                if not tool.make_dir(post_path, 0):
                    log.error("第%s页创建相册目录 %s 失败" % (page_count, post_path))
                    tool.process_exit()
                # Download into the fallback (title-less) directory
                image_path = post_path

            image_count = 1
            for image_data in album_data["attr"]["img"]:
                image_url = "http://www.zunguang.com/%s" % str(image_data["url"])
                log.step("开始下载第%s页第%s张图片 %s" % (page_count, image_count, image_url))

                file_type = image_url.split(".")[-1]
                file_path = os.path.join(image_path, "%03d.%s" % (image_count, file_type))
                try:
                    if tool.save_net_file(image_url, file_path, True):
                        log.step("第%s页第%s张图片下载成功" % (page_count, image_count))
                        image_count += 1
                    else:
                        log.error("第%s页第%s张图片 %s 下载失败" % (page_count, image_count, image_url))
                except SystemExit:
                    # User aborted mid-album: remove the partial directory so
                    # this page is retried in full on the next run
                    log.step("提前退出")
                    tool.remove_dir(image_path)
                    is_over = True
                    break

            if not is_over:
                # image_count started at 1, so subtract it back out
                total_image_count += image_count - 1
                page_count += 1

        # Persist the next page to crawl for the following run
        save_data_dir = os.path.dirname(self.save_data_path)
        if not os.path.exists(save_data_dir):
            tool.make_dir(save_data_dir, 0)
        with open(self.save_data_path, "w") as save_file:
            save_file.write(str(page_count))

        log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), total_image_count))