예제 #1
0
def read_save_data(save_data_path,
                   key_index=0,
                   default_value_list=None,
                   check_duplicate_index=True):
    """
    Read a tab-separated save-data file and build a dict keyed by one column.

    :Args:
    - save_data_path - path of the save-data file
    - key_index - column index used as the (unique) dictionary key
    - default_value_list - per-column default values; an entry "_N" means
      "when this column is empty, copy the value of column N"
    - check_duplicate_index - abort the process when the key column repeats

    Returns a dict mapping the key column's value to the full column list.
    """
    # BUG FIX: the default used to be a mutable [] which is evaluated once
    # and shared across calls; use a None sentinel instead.
    if default_value_list is None:
        default_value_list = []
    result_list = {}
    if not os.path.exists(save_data_path):
        return result_list
    for single_save_data in file.read_file(save_data_path,
                                           file.READ_FILE_TYPE_LINE):
        single_save_data = single_save_data.replace("\n", "").replace("\r", "")
        if len(single_save_data) == 0:
            continue
        single_save_list = single_save_data.split("\t")

        if check_duplicate_index and single_save_list[key_index] in result_list:
            output.print_msg("存档中存在重复行 %s" % single_save_list[key_index])
            tool.process_exit()

        # strip leading/trailing whitespace from every field
        single_save_list = [value.strip() for value in single_save_list]

        # fill missing or empty fields with their defaults
        for index, default_value in enumerate(default_value_list):
            # a value starting with "_" refers to another column, e.g.
            # ["", "_0"] means "column 1 defaults to column 0's value"
            if default_value != "" and default_value[0] == "_":
                default_value = single_save_list[int(
                    default_value.replace("_", ""))]
            if len(single_save_list) <= index:
                single_save_list.append(default_value)
            if single_save_list[index] == "":
                single_save_list[index] = default_value
        result_list[single_save_list[key_index]] = single_save_list
    return result_list
예제 #2
0
def get_game_invalid_achievements(game_id):
    """
    Check a game's astats.nl info page for invalid achievements and log them.

    :Args:
    - game_id - Steam AppID of the game

    Exits the process when the page cannot be fetched; returns silently when
    the game is unknown or has no achievements.
    """
    game_index_url = "http://astats.astats.nl/astats/Steam_Game_Info.php"
    query_data = {"AppID": game_id}
    game_index_response = net.http_request(game_index_url,
                                           method="GET",
                                           fields=query_data)
    if game_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        output.print_msg("游戏 %s 访问失败" % game_id)
        tool.process_exit()
    # game id does not exist in the database
    if game_index_response.data.find(
            "This game cannot be found in the database.") >= 0:
        return
    achievement_text = tool.find_sub_string(
        game_index_response.data,
        '<span class="GameInfoBoxRow">Achievements</span><br>', "</td>")
    # no achievements at all
    if not achievement_text:
        return
    achievement_text = achievement_text.strip()
    if not crawler.is_integer(achievement_text):
        invalid_achievement_text = tool.find_sub_string(
            achievement_text, '<font color="#FF0000">', "</font>")
        if invalid_achievement_text:
            output.print_msg("游戏 %s, 存在无效成就,%s" %
                             (game_id, invalid_achievement_text))
        else:
            # BUG FIX: the original printed invalid_achievement_text here,
            # which is always empty/falsy in this branch; report the raw
            # unrecognized achievement text instead
            output.print_msg("游戏 %s, 存在未知成就文字:%s" %
                             (game_id, achievement_text))
예제 #3
0
    def __init__(self, single_save_data: list, main_thread: Crawler):
        """
        Worker thread used for multi-threaded downloading.

        :Args:
        - single_save_data - the save-data row this thread works on
        - main_thread - the main Crawler instance owning lock and semaphore
        """
        # refuse to start with anything that is not the main Crawler
        if not isinstance(main_thread, Crawler):
            output.print_msg("下载线程参数异常")
            tool.process_exit()
        try:
            threading.Thread.__init__(self)
            self.main_thread = main_thread
            self.single_save_data = single_save_data
            self.thread_lock = main_thread.thread_lock
            # reserve a slot; released when the thread finishes
            main_thread.thread_semaphore.acquire()
        except KeyboardInterrupt:
            self.main_thread.stop_process()
        # per-thread download counters
        self.total_photo_count = self.total_video_count = 0
        self.total_audio_count = self.total_content_count = 0
        self.temp_path_list = []
        if single_save_data:
            self.step("开始")
예제 #4
0
 def main_thread_check(self):
     """
     Exit this worker when the main thread has stopped (external interrupt).
     """
     if self.main_thread.is_running():
         return
     self.notify_main_thread()
     tool.process_exit(tool.PROCESS_EXIT_CODE_NORMAL)
예제 #5
0
def read_save_data(save_data_path, key_index, default_value_list):
    """
    Read a tab-separated save-data file and build a dict keyed by one column.

    :Args:
    - save_data_path - path of the save-data file
    - key_index - column index used as the (unique) dictionary key
    - default_value_list - per-column default values; an entry "_N" means
      "when this column is empty, copy the value of column N"

    Returns a dict mapping the key column's value to the full column list.
    """
    result_list = {}
    if not os.path.exists(path.change_path_encoding(save_data_path)):
        return result_list
    for single_save_data in tool.read_file(save_data_path, tool.READ_FILE_TYPE_LINE):
        # strip the UTF-8 BOM and line terminators
        single_save_data = single_save_data.replace("\xef\xbb\xbf", "").replace("\n", "").replace("\r", "")
        if len(single_save_data) == 0:
            continue
        single_save_list = single_save_data.split("\t")

        if single_save_list[key_index] in result_list:
            output.print_msg("存档中存在重复行 %s" % single_save_list[key_index])
            tool.process_exit()

        # strip leading/trailing whitespace from every field
        # BUG FIX: map() returns a lazy iterator on Python 3, which would break
        # the append()/indexing below; a list comprehension works on 2 and 3
        single_save_list = [value.strip() for value in single_save_list]

        # fill missing or empty fields with their defaults
        for index, default_value in enumerate(default_value_list):
            # a value starting with "_" refers to another column, e.g.
            # ["", "_0"] means "column 1 defaults to column 0's value"
            if default_value != "" and default_value[0] == "_":
                default_value = single_save_list[int(default_value.replace("_", ""))]
            if len(single_save_list) <= index:
                single_save_list.append(default_value)
            if single_save_list[index] == "":
                single_save_list[index] = default_value
        result_list[single_save_list[key_index]] = single_save_list
    return result_list
예제 #6
0
 def check_download_failure_exit(self, is_process_exit=True):
     """
     After a download failure, decide whether this thread must stop.

     When the crawler is configured to stop on failure, either exit the
     process (is_process_exit=True) or report True to the caller; otherwise
     report False.
     """
     if not self.main_thread.is_thread_exit_after_download_failure:
         return False
     if is_process_exit:
         tool.process_exit(tool.PROCESS_EXIT_CODE_ERROR)
     return True
예제 #7
0
    def main(self):
        """
        Main download loop: parse the save-data file, spawn one Download
        thread per account (bounded by self.thread_count), wait for all of
        them, persist unfinished accounts, then rewrite the save file.
        """
        global ACCOUNTS

        # parse the save-data file
        # columns: account_name  image_count  video_count  last_created_time
        account_list = robot.read_save_data(self.save_data_path, 0, ["", "0", "0", "0"])
        ACCOUNTS = account_list.keys()

        if not set_csrf_token():
            log.error("token和session获取查找失败")
            tool.process_exit()

        # download each account id in turn
        main_thread_count = threading.activeCount()
        for account_name in sorted(account_list.keys()):
            # throttle: wait while the configured number of worker threads is running
            while threading.activeCount() >= self.thread_count + main_thread_count:
                if robot.is_process_end() == 0:
                    time.sleep(10)
                else:
                    break

            # early termination was requested
            if robot.is_process_end() > 0:
                break

            # start downloading this account
            thread = Download(account_list[account_name], self.thread_lock)
            thread.start()

            time.sleep(1)

        # wait until every thread except the main one has finished
        while threading.activeCount() > main_thread_count:
            time.sleep(10)

        # persist the records of accounts that did not finish
        # NOTE(review): worker threads appear to remove finished accounts from
        # ACCOUNTS — confirm against the Download class
        if len(ACCOUNTS) > 0:
            new_save_data_file = open(NEW_SAVE_DATA_PATH, "a")
            for account_name in ACCOUNTS:
                new_save_data_file.write("\t".join(account_list[account_name]) + "\n")
            new_save_data_file.close()

        # remove the temporary directories
        self.finish_task()

        # re-sort and rewrite the save-data file
        robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path)

        log.step("全部下载完毕,耗时%s秒,共计图片%s张,视频%s个" % (self.get_run_time(), TOTAL_IMAGE_COUNT, TOTAL_VIDEO_COUNT))
예제 #8
0
def check_login():
    """
    Probe the bcy.net home page to detect whether valid account cookies exist.

    When the session looks anonymous (redirected away from the home page),
    ask the user whether to continue; answering no terminates the process.
    """
    home_page_url = "http://bcy.net/home/user/index"
    home_page_return = tool.http_request(home_page_url)
    if home_page_return[0] == 1:
        real_url = home_page_return[2].geturl()
        if (home_page_url != real_url) or ("http://bcy.net/start" == real_url):
            # keep asking until the user gives a recognizable answer
            while True:
                input_str = raw_input(tool.get_time() + " 没有检测到您的账号信息,可能无法获取那些只对粉丝开放的隐藏作品,是否继续下一步操作? (Y)es or (N)o: ").lower()
                if input_str in ["y", "yes"]:
                    break
                elif input_str in ["n", "no"]:
                    tool.process_exit()
예제 #9
0
def auto_redirect_visit(url):
    """
    Visit a url, transparently following javascript location.replace()
    redirects, and return the final page content as a string.

    Returns False when the request fails or the response body is empty;
    exits the process when the page reports bad login credentials.
    """
    page_return_code, page_response = tool.http_request(url)[:2]
    if page_return_code != 1:
        return False
    # follow the redirect when exactly one target url is present
    redirect_url_find = re.findall('location.replace\(["|\']([^"|^\']*)["|\']\)', page_response)
    if len(redirect_url_find) == 1:
        return auto_redirect_visit(redirect_url_find[0])
    # page that is unreachable without valid cookies
    if page_response.find("用户名或密码错误") != -1:
        log.error("登陆状态异常,请在浏览器中重新登陆微博账号")
        tool.process_exit()
    # hand back the page body, or False when it is empty
    return str(page_response) if page_response else False
예제 #10
0
def read_save_data(save_data_path: str,
                   key_index: int = 0,
                   default_value_list: list = None,
                   check_duplicate_index: bool = True) -> dict:
    """
    Read a tab-separated save-data file and build a dict keyed by one column.

    :Args:
    - save_data_path - path of the save-data file
    - key_index - column index used as the (unique) primary key
    - default_value_list - per-column default values; an entry "_N" means
      "when this column is empty, copy the value of column N"
    - check_duplicate_index - whether to enforce primary-key uniqueness
    """
    if default_value_list is None:
        default_value_list = []
    result_list = {}
    if not os.path.exists(save_data_path):
        return result_list
    for raw_line in file.read_file(save_data_path,
                                   file.READ_FILE_TYPE_LINE):
        raw_line = raw_line.replace("\n", "").replace("\r", "")
        if not raw_line:
            continue
        single_save_list = raw_line.split("\t")

        if check_duplicate_index and single_save_list[key_index] in result_list:
            output.print_msg(f"存档中存在重复行{single_save_list[key_index]}")
            tool.process_exit()

        # strip whitespace around every field
        single_save_list = [field.strip() for field in single_save_list]

        # apply per-column defaults for missing or empty fields
        for index, default_value in enumerate(default_value_list):
            # "_N" means: when this column is empty, mirror column N's value
            if default_value.startswith("_"):
                default_value = single_save_list[int(default_value.replace("_", ""))]
            if len(single_save_list) <= index:
                single_save_list.append(default_value)
            if single_save_list[index] == "":
                single_save_list[index] = default_value
        result_list[single_save_list[key_index]] = single_save_list
    return result_list
예제 #11
0
    def __init__(self, account_info, main_thread):
        """
        Worker download thread for one account.

        :param account_info:
            save-data row for the account handled by this thread

        :param main_thread:
            object of main thread(class Crawler)
        """
        threading.Thread.__init__(self)
        self.account_info = account_info
        # refuse to start with anything that is not the main Crawler
        if not isinstance(main_thread, Crawler):
            output.print_msg("下载线程参数异常")
            tool.process_exit()
        self.main_thread = main_thread
        self.thread_lock = main_thread.thread_lock
        # reserve a worker slot; released when the thread finishes
        main_thread.thread_semaphore.acquire()
        # per-thread download counters
        self.total_photo_count = 0
        self.total_video_count = 0
        self.total_audio_count = 0
        self.temp_path_list = []
예제 #12
0
    def __init__(self, account_info, main_thread):
        """
        Worker download thread for one account.

        :param account_info:
            save-data row for the account handled by this thread

        :param main_thread:
            main thread object (class Crawler); supplies the shared
            threading.Lock used to update global counters
        """
        threading.Thread.__init__(self)
        self.account_info = account_info
        # guard: the caller must hand over the main Crawler instance
        if not isinstance(main_thread, Crawler):
            output.print_msg("下载线程参数异常")
            tool.process_exit()
        self.main_thread = main_thread
        self.thread_lock = main_thread.thread_lock
        # per-thread download counters
        self.total_image_count = 0
        self.total_video_count = 0
        self.temp_path_list = []
예제 #13
0
    def run(self):
        """
        Thread entry point: crawl one tumblr account page by page, download
        its images and videos, then update the account's save record and the
        global counters.
        """
        global TOTAL_IMAGE_COUNT
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]

        try:
            log.step(account_id + " 开始")

            # use a temporary directory when re-sorting is required,
            # otherwise download straight into the destination directory
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_id)
                video_path = os.path.join(VIDEO_TEMP_PATH, account_id)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_id)

            page_count = 1
            image_count = 1
            video_count = 1
            first_post_id = ""
            unique_list = []
            is_over = False
            need_make_image_dir = True
            need_make_video_dir = True
            while not is_over:
                post_url_list = get_one_page_post_url_list(account_id, page_count)
                if post_url_list is None:
                    log.error(account_id + " 无法访问第%s页相册页" % page_count)
                    tool.process_exit()

                if len(post_url_list) == 0:
                    # nothing left to download
                    break

                log.trace(account_id + " 相册第%s页获取的所有信息页:%s" % (page_count, post_url_list))
                post_url_list_group_by_post_id = filter_post_url(post_url_list)
                log.trace(account_id + " 相册第%s页去重排序后的信息页:%s" % (page_count, post_url_list_group_by_post_id))
                log.step(account_id + " 相册第%s页获取到%s页信息页" % (page_count, len(post_url_list_group_by_post_id)))
                for post_id in sorted(post_url_list_group_by_post_id.keys(), reverse=True):
                    # stop when the post id is not newer than the last saved record
                    if post_id <= self.account_info[3]:
                        is_over = True
                        break

                    # remember the first (newest) post id as the new save record
                    if first_post_id == "":
                        first_post_id = post_id

                    # fetch the post page and keep only the content of its head tag
                    post_url = "http://%s.tumblr.com/post/%s" % (account_id, post_id)
                    post_page_head = get_post_page_head(post_url, post_url_list_group_by_post_id[post_id])
                    if post_page_head is None:
                        log.error(account_id + " 无法访问信息页 %s" % post_url)
                        continue
                    if not post_page_head:
                        log.error(account_id + " 信息页 %s 截取head标签异常" % post_url)
                        continue

                    # read og:type (whether the page is a video, photo or other type)
                    og_type = tool.find_sub_string(post_page_head, '<meta property="og:type" content="', '" />')
                    if not og_type:
                        log.error(account_id + " 信息页 %s,'og:type'获取异常" % post_url)
                        continue

                    # skip empty, audio, quote and link posts
                    if og_type in ["tumblr-feed:entry", "tumblr-feed:audio", "tumblr-feed:quote", "tumblr-feed:link"]:
                        continue

                    # deduplicate posts that re-appear when new posts shift paging
                    if post_id in unique_list:
                        continue
                    else:
                        unique_list.append(post_id)

                    # video download
                    if IS_DOWNLOAD_VIDEO and og_type == "tumblr-feed:video":
                        video_list = get_video_list(account_id, post_id)
                        if video_list is None:
                            log.error(account_id + " 第%s个视频 日志id:%s无法访问播放页" % (video_count, post_id))
                        else:
                            if len(video_list) > 0:
                                for video_url, video_type in list(video_list):
                                    log.step(account_id + " 开始下载第%s个视频 %s" % (video_count, video_url))

                                    # first video: create the download directory
                                    if need_make_video_dir:
                                        if not tool.make_dir(video_path, 0):
                                            log.error(account_id + " 创建视频下载目录 %s 失败" % video_path)
                                            tool.process_exit()
                                        need_make_video_dir = False

                                    file_type = video_type.split("/")[-1]
                                    video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type))
                                    if tool.save_net_file(video_url, video_file_path):
                                        log.step(account_id + " 第%s个视频下载成功" % video_count)
                                        video_count += 1
                                    else:
                                        log.error(account_id + " 第%s个视频 %s 下载失败" % (video_count, video_url))
                            else:
                                log.error(account_id + " 第%s个视频 日志id:%s 中没有找到视频" % (video_count, post_id))

                    # image download
                    if IS_DOWNLOAD_IMAGE:
                        if og_type == "tumblr-feed:video":
                            page_image_url_list = []
                            video_image_url = tool.find_sub_string(post_page_head, '<meta property="og:image" content="', '" />')
                            if video_image_url:
                                page_image_url_list.append(video_image_url)
                        else:
                            page_image_url_list = re.findall('"(http[s]?://\w*[.]?media.tumblr.com/[^"]*)"', post_page_head)
                            log.trace(account_id + " 信息页 %s 过滤前的所有图片:%s" % (post_url, page_image_url_list))
                            # filter out avatars and different resolutions of the same image
                            page_image_url_list = filter_different_resolution_images(page_image_url_list)
                        log.trace(account_id + " 信息页 %s 获取的的所有图片:%s" % (post_url, page_image_url_list))
                        if len(page_image_url_list) > 0:
                            for image_url in page_image_url_list:
                                log.step(account_id + " 开始下载第%s张图片 %s" % (image_count, image_url))

                                # first image: create the download directory
                                if need_make_image_dir:
                                    if not tool.make_dir(image_path, 0):
                                        log.error(account_id + " 创建图片下载目录 %s 失败" % image_path)
                                        tool.process_exit()
                                    need_make_image_dir = False

                                file_type = image_url.split(".")[-1]
                                image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                                if tool.save_net_file(image_url, image_file_path):
                                    log.step(account_id + " 第%s张图片下载成功" % image_count)
                                    image_count += 1
                                else:
                                    log.error(account_id + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                        else:
                            log.error(account_id + " 第%s张图片 信息页 %s 中没有找到图片" % (image_count, post_url))

                if not is_over:
                    # stop when the page count configured in the config file is reached
                    if 0 < GET_PAGE_COUNT <= page_count:
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_id + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))

            # move the downloaded files from the temp directory and sort them
            if IS_SORT:
                if image_count > 1:
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_id + " 图片从下载目录移动到保存目录成功")
                    else:
                        log.error(account_id + " 创建图片保存目录 %s 失败" % destination_path)
                        tool.process_exit()
                if video_count > 1:
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_id)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4):
                        log.step(account_id + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_id + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # update the save record: new totals and the newest post id
            if first_post_id != "":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = str(int(self.account_info[2]) + video_count - 1)
                self.account_info[3] = first_post_id

            # persist the final record and update the global counters under the lock
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_id + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_id + " 提前退出")
            else:
                log.error(account_id + " 异常退出")
예제 #14
0
def http_request(url, method="GET", fields=None, binary_data=None, header_list=None, cookies_list=None, encode_multipart=False, json_decode=False,
                 is_auto_proxy=True, is_auto_redirect=True, is_gzip=True, is_url_encode=True, is_auto_retry=True, is_random_ip=True,
                 connection_timeout=NET_CONFIG["HTTP_CONNECTION_TIMEOUT"], read_timeout=NET_CONFIG["HTTP_READ_TIMEOUT"]):
    """Http request via urllib3

    :param url:
        the url which you want visit, start with "http://" or "https://"

    :param method:
        request method, value in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]

    :param fields:
        dictionary type of request data, will urlencode() them to string. like post data, query string, etc
        not work with binary_data

    :param binary_data:
        binary type of request data, not work with post_data

    :param header_list:
        customize header dictionary

    :param cookies_list:
        customize cookies dictionary, will replaced header_list["Cookie"]

    :param encode_multipart:
        see "encode_multipart" in urllib3.request_encode_body

    :param is_auto_proxy:
        is auto use proxy when init PROXY_HTTP_CONNECTION_POOL

    :param is_auto_redirect:
        is auto redirect, when response.status in [301, 302, 303, 307, 308]

    :param is_auto_retry:
        is auto retry, when response.status in [500, 502, 503, 504]

    :param connection_timeout:
        customize connection timeout seconds

    :param read_timeout:
        customize read timeout seconds

    :param is_random_ip:
        is counterfeit a request header with random ip, will replaced header_list["X-Forwarded-For"] and header_list["X-Real-Ip"]

    :param json_decode:
        is return a decoded json data when response status = 200
        if decode failure will replace response status with HTTP_RETURN_CODE_JSON_DECODE_ERROR
    """
    # reject anything that is not an absolute http(s) url
    url = str(url).strip()
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    # lazily create the shared connection pool; prefer the proxy pool when available
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()
    connection_pool = HTTP_CONNECTION_POOL
    if PROXY_HTTP_CONNECTION_POOL is not None and is_auto_proxy:
        connection_pool = PROXY_HTTP_CONNECTION_POOL
    if is_url_encode:
        url = url_encode(url)

    if header_list is None:
        header_list = {}

    # set a User-Agent when the caller did not provide one
    if "User-Agent" not in header_list:
        header_list["User-Agent"] = _random_user_agent()

    # forge a random client IP
    if is_random_ip:
        random_ip = _random_ip_address()
        header_list["X-Forwarded-For"] = random_ip
        header_list["X-Real-Ip"] = random_ip

    # set cookies
    if cookies_list:
        header_list["Cookie"] = build_header_cookie_string(cookies_list)

    # accept gzip-compressed responses
    if is_gzip:
        header_list["Accept-Encoding"] = "gzip"

    # timeout configuration (a value <= 0 disables the corresponding timeout)
    timeout = urllib3.Timeout(connect=float(connection_timeout) if connection_timeout > 0 else None, read=read_timeout if read_timeout > 0 else None)

    retry_count = 0
    while True:
        # block while the process is paused; bail out when a global exit was requested
        thread_event.wait()
        if EXIT_FLAG:
            tool.process_exit(0)

        try:
            # bodyless methods pass fields as the query string; the rest as the request body
            if method in ['DELETE', 'GET', 'HEAD', 'OPTIONS']:
                response = connection_pool.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields)
            else:
                if binary_data is None:
                    response = connection_pool.request(method, url, fields=fields, encode_multipart=encode_multipart, headers=header_list,
                                                       redirect=is_auto_redirect, timeout=timeout)
                else:
                    response = connection_pool.request(method, url, body=binary_data, encode_multipart=encode_multipart, headers=header_list,
                                                       redirect=is_auto_redirect, timeout=timeout)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data.decode())
                except ValueError:
                    # utf-8 decode failed: retry with the charset advertised in Content-Type
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            except:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            elif response.status == 429:  # Too Many Requests
                output.print_msg(url + " Too Many Requests, sleep")
                time.sleep(60)
                continue
            elif response.status in [500, 502, 503, 504] and is_auto_retry:  # temporary server error, retry
                if retry_count < NET_CONFIG["HTTP_REQUEST_RETRY_COUNT"]:
                    retry_count += 1
                    time.sleep(30)
                    continue
                else:
                    return response
            return response
        except MemoryError:
            return ErrorResponse(HTTP_RETURN_CODE_RESPONSE_TO_LARGE)
        except Exception as e:
            message = str(e)
            if isinstance(e, urllib3.exceptions.ConnectTimeoutError):
                # domain name could not be resolved
                if message.find("[Errno 11004] getaddrinfo failed") >= 0:
                    return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
                elif message.find("[Errno 11001] getaddrinfo failed") >= 0:
                    return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            elif isinstance(e, urllib3.exceptions.MaxRetryError):
                if message.find("Caused by ResponseError('too many redirects'") >= 0:
                    return ErrorResponse(HTTP_RETURN_CODE_TOO_MANY_REDIRECTS)
            elif isinstance(e, urllib3.exceptions.DecodeError):
                # broken gzip payload: retry the same request once without compression
                if message.find("'Received response with content-encoding: gzip, but failed to decode it.'") >= 0:
                    return http_request(url, method=method, fields=fields, binary_data=binary_data, header_list=header_list, cookies_list=cookies_list,
                                        encode_multipart=encode_multipart, json_decode=json_decode, is_auto_proxy=is_auto_proxy, is_auto_redirect=is_auto_redirect,
                                        is_gzip=False, is_url_encode=False, is_auto_retry=is_auto_retry, is_random_ip=is_random_ip,
                                        connection_timeout=connection_timeout, read_timeout=read_timeout)
            # import traceback
            # output.print_msg(message)
            # output.print_msg(traceback.format_exc())
            output.print_msg(url + " 访问超时,重试中")
            time.sleep(5)

        retry_count += 1
        if retry_count >= NET_CONFIG["HTTP_REQUEST_RETRY_COUNT"]:
            output.print_msg("无法访问页面:" + url)
            return ErrorResponse(HTTP_RETURN_CODE_RETRY)
예제 #15
0
    def run(self):
        global TOTAL_IMAGE_COUNT
        global TOTAL_VIDEO_COUNT

        account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            account_id = get_account_id(account_name)
            if account_id is None:
                log.error(account_name + " account id 查找失败")
                tool.process_exit()

            image_count = 1
            video_count = 1
            cursor = INIT_CURSOR
            first_created_time = "0"
            is_over = False
            need_make_image_dir = True
            need_make_video_dir = True
            while not is_over:
                # 获取指定时间后的一页媒体信息
                media_data = get_one_page_media_data(account_id, cursor)
                if media_data is None:
                    log.error(account_name + " 媒体列表解析异常")
                    tool.process_exit()

                nodes_data = media_data["nodes"]
                for photo_info in nodes_data:
                    if not robot.check_sub_key(("is_video", "display_src", "date"), photo_info):
                        log.error(account_name + " 媒体信息解析异常")
                        break
                    if photo_info["is_video"] and not robot.check_sub_key(("code",), photo_info):
                        log.error(account_name + " 视频code解析异常")
                        break

                    # 检查是否已下载到前一次的图片
                    if int(photo_info["date"]) <= int(self.account_info[3]):
                        is_over = True
                        break

                    # 将第一张图片的上传时间做为新的存档记录
                    if first_created_time == "0":
                        first_created_time = str(int(photo_info["date"]))

                    # 图片
                    if IS_DOWNLOAD_IMAGE:
                        image_url = str(photo_info["display_src"].split("?")[0])
                        log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                        # 第一张图片,创建目录
                        if need_make_image_dir:
                            if not tool.make_dir(image_path, 0):
                                log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                                tool.process_exit()
                            need_make_image_dir = False

                        file_type = image_url.split(".")[-1]
                        image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, image_file_path):
                            log.step(account_name + " 第%s张图片下载成功" % image_count)
                            image_count += 1
                        else:
                            log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))

                    # 视频
                    if IS_DOWNLOAD_VIDEO and photo_info["is_video"]:
                        # 根据日志ID获取视频下载地址
                        video_url = get_video_url(photo_info["code"])
                        if video_url is None:
                            log.error(account_name + " 第%s个视频code:%s 无法访问" % (video_count, photo_info["code"]))
                            continue
                        if not video_url:
                            log.error(account_name + " 第%s个视频code:%s 没有获取到下载地址" % (video_count, photo_info["code"]))
                            continue

                        log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url))

                        # 第一个视频,创建目录
                        if need_make_video_dir:
                            if not tool.make_dir(video_path, 0):
                                log.error(account_name + " 创建视频下载目录 %s 失败" % video_path)
                                tool.process_exit()
                            need_make_video_dir = False

                        file_type = video_url.split(".")[-1]
                        video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type))
                        if tool.save_net_file(video_url, video_file_path):
                            log.step(account_name + " 第%s个视频下载成功" % video_count)
                            video_count += 1
                        else:
                            log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))

                    # 达到配置文件中的下载数量,结束
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break

                if not is_over:
                    if media_data["page_info"]["has_next_page"]:
                        cursor = str(media_data["page_info"]["end_cursor"])
                    else:
                        is_over = True

            log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))

            # 排序
            if IS_SORT:
                if image_count > 1:
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_name + " 图片从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                        tool.process_exit()
                if video_count > 1:
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4):
                        log.step(account_name + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # 新的存档记录
            if first_created_time != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = str(int(self.account_info[2]) + video_count - 1)
                self.account_info[3] = first_created_time

            # 保存最后的信息
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_name)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #16
0
    def main(self):
        """Crawl jigadori.fkoji.com page by page and download every new tweet image.

        Archive layout (tab separated, keyed by column 0):
            ALL_SIGN row : [ALL_SIGN, next global image index, newest tweet timestamp]
            account row  : [account_id, per-account image count]

        Stops paging as soon as a tweet is not newer than the archived timestamp.
        """
        # Parse the archive file
        account_list = robot.read_save_data(self.save_data_path, 0, ["", "", ""])

        # The ALL_SIGN key holds the global totals
        if ALL_SIGN in account_list:
            image_start_index = int(account_list[ALL_SIGN][1])
            save_data_image_time = int(account_list[ALL_SIGN][2])
            account_list.pop(ALL_SIGN)
        else:
            image_start_index = 0
            save_data_image_time = 0

        # Download into a temporary directory first when sorting is enabled
        if self.is_sort:
            image_path = self.image_temp_path
        else:
            image_path = self.image_download_path

        if not tool.make_dir(image_path, 0):
            # Could not create the image download directory
            self.print_msg("图片下载目录%s创建失败!" % self.image_download_path)
            tool.process_exit()

        # Download loop
        page_index = 1
        image_count = 1
        first_image_time = 0
        unique_list = []
        is_over = False

        while not is_over:
            index_url = "http://jigadori.fkoji.com/?p=%s" % page_index
            index_page_return_code, index_page_response = tool.http_request(index_url)[:2]
            if index_page_return_code != 1:
                log.error("无法访问首页地址 %s" % index_url)
                tool.process_exit()

            index_page = BeautifulSoup.BeautifulSoup(index_page_response)
            photo_list = index_page.body.findAll("div", "photo")
            # Reached the last page
            if not photo_list:
                break
            for photo_info in photo_list:
                if isinstance(photo_info, BeautifulSoup.NavigableString):
                    continue

                # Parse the tweet publish timestamp from the photo node
                tweet_created_time = get_tweet_created_time(photo_info)
                if tweet_created_time is None:
                    log.error("第%s张图片,解析tweet-created-at失败" % image_count)
                    continue

                # Anything not newer than the archived time is already downloaded
                if tweet_created_time <= save_data_image_time:
                    is_over = True
                    break

                # The first (newest) tweet's time becomes the new archive record
                if first_image_time == 0:
                    first_image_time = tweet_created_time

                # Parse the tweet account from the photo node
                account_id = get_tweet_account_id(photo_info)
                if account_id is None:
                    log.error("第%s张图片,解析tweet账号失败" % image_count)
                    continue

                # Collect the image tags
                img_tags = photo_info.findAll("img")
                for tag in img_tags:
                    tag_attr = dict(tag.attrs)
                    if robot.check_sub_key(("src", "alt"), tag_attr):
                        image_url = str(tag_attr["src"]).replace(" ", "")
                        # Skip duplicates caused by new tweets shifting page boundaries
                        if image_url in unique_list:
                            continue
                        else:
                            unique_list.append(image_url)

                        log.step("开始下载第%s张图片 %s" % (image_count, image_url))

                        file_type = image_url.split(".")[-1]
                        if file_type.find("/") != -1:
                            file_type = "jpg"
                        file_path = os.path.join(image_path, "%05d_%s.%s" % (image_count, account_id, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step("第%s张图片下载成功" % image_count)
                            image_count += 1
                        else:
                            log.error("第%s张图片 %s,account_id:%s,下载失败" % (image_count, image_url, account_id))
                if is_over:
                    break

            if not is_over:
                page_index += 1

        log.step("下载完毕")

        # Sort and copy to the save directory
        if self.is_sort:
            is_check_ok = False
            while not is_check_ok:
                # Wait for the manual check of all downloaded images to finish
                input_str = raw_input(tool.get_time() + " 已经下载完毕,是否下一步操作? (Y)es or (N)o: ")
                input_str = input_str.lower()
                if input_str in ["y", "yes"]:
                    is_check_ok = True
                elif input_str in ["n", "no"]:
                    tool.process_exit()

            all_path = os.path.join(self.image_download_path, "all")
            if not tool.make_dir(all_path, 0):
                log.error("创建目录 %s 失败" % all_path)
                tool.process_exit()

            file_list = tool.get_dir_files_name(self.image_temp_path, "desc")
            for file_name in file_list:
                # Use a dedicated name instead of shadowing image_path (the download dir)
                temp_file_path = os.path.join(self.image_temp_path, file_name)
                file_name_list = file_name.split(".")
                file_type = file_name_list[-1]
                account_id = "_".join(".".join(file_name_list[:-1]).split("_")[1:])

                # Copy into the combined "all" directory
                image_start_index += 1
                destination_file_name = "%05d_%s.%s" % (image_start_index, account_id, file_type)
                destination_path = os.path.join(all_path, destination_file_name)
                tool.copy_files(temp_file_path, destination_path)

                # Copy into the per-account directory
                each_account_path = os.path.join(self.image_download_path, "single", account_id)
                if not os.path.exists(each_account_path):
                    if not tool.make_dir(each_account_path, 0):
                        log.error("创建目录 %s 失败" % each_account_path)
                        tool.process_exit()
                if account_id in account_list:
                    account_list[account_id][1] = int(account_list[account_id][1]) + 1
                else:
                    account_list[account_id] = [account_id, 1]
                destination_file_name = "%05d.%s" % (account_list[account_id][1], file_type)
                destination_path = os.path.join(each_account_path, destination_file_name)
                tool.copy_files(temp_file_path, destination_path)

            log.step("图片从下载目录移动到保存目录成功")

            # Remove the temporary directory
            tool.remove_dir(self.image_temp_path)

        # Save the new archive file.
        # Bug fix: when no new image was found first_image_time is still 0; keep the
        # previous archive timestamp instead of writing 0, which would cause a full
        # re-download on the next run.
        if first_image_time == 0:
            first_image_time = save_data_image_time
        temp_list = [account_list[key] for key in sorted(account_list.keys())]
        # Insert the global totals at the head of the list
        temp_list.insert(0, [ALL_SIGN, str(image_start_index), str(first_image_time)])
        tool.write_file(tool.list_to_string(temp_list), self.save_data_path, 2)

        log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), image_count - 1))
예제 #17
0
    def run(self):
        """Worker thread entry: download all new images for a single account.

        self.account_info layout (tab separated in the save file):
            [0] account id
            [1] number of images downloaded so far
            [2] newest post id seen in the previous run
        Updates the shared globals TOTAL_IMAGE_COUNT / ACCOUNTS under self.thread_lock.
        """
        global TOTAL_IMAGE_COUNT

        account_id = self.account_info[0]

        try:
            log.step(account_id + " 开始")

            # Use a temporary directory when re-sorting is required, otherwise
            # download straight into the target directory
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_id)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)

            # Image download loop
            page_count = 1
            image_count = 1
            first_post_id = ""
            unique_list = []
            is_over = False
            need_make_download_dir = True
            while not is_over:
                post_url_list = get_one_page_post_url_list(account_id, page_count)
                # Album index page could not be fetched
                if post_url_list is None:
                    log.error(account_id + " 无法访问第%s页相册页" % page_count)
                    tool.process_exit()

                if len(post_url_list) == 0:
                    # No more posts: all pages crawled
                    break

                # De-duplicate and sort newest-first
                log.trace(account_id + " 相册第%s页获取的所有信息页:%s" % (page_count, post_url_list))
                post_url_list = sorted(list(set(post_url_list)), reverse=True)
                log.trace(account_id + " 相册第%s页去重排序后的信息页:%s" % (page_count, post_url_list))
                for post_url in post_url_list:
                    post_id = post_url.split("/")[-1].split("_")[-1]

                    # Stop when reaching a post already downloaded in a previous run.
                    # NOTE(review): post ids are compared as strings, which is only
                    # correct while all ids have the same length — confirm.
                    if post_id <= self.account_info[2]:
                        is_over = True
                        break

                    # Skip duplicates caused by new posts shifting page boundaries
                    if post_id in unique_list:
                        continue
                    else:
                        unique_list.append(post_id)
                    # The first (newest) post id becomes the new archive record
                    if first_post_id == "":
                        first_post_id = post_id

                    post_page_return_code, post_page = tool.http_request(post_url)[:2]
                    if post_page_return_code != 1:
                        log.error(account_id + " 第%s张图片,无法获取信息页 %s" % (image_count, post_url))
                        continue

                    image_url_list = get_image_url_list(post_page)
                    log.trace(account_id + " 信息页 %s 获取的所有图片:%s" % (post_url, image_url_list))
                    if len(image_url_list) == 0:
                        log.error(account_id + " 第%s张图片,信息页 %s 中没有找到图片" % (image_count, post_url))
                        continue
                    for image_url in image_url_list:
                        # Strip the query string when it follows the file extension
                        if image_url.rfind("?") > image_url.rfind("."):
                            image_url = image_url.split("?")[0]
                        log.step(account_id + " 开始下载第%s张图片 %s" % (image_count, image_url))

                        # Create the download directory lazily before the first image
                        if need_make_download_dir:
                            if not tool.make_dir(image_path, 0):
                                log.error(account_id + " 创建图片下载目录 %s 失败" % image_path)
                                tool.process_exit()
                            need_make_download_dir = False

                        file_type = image_url.split(".")[-1]
                        file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_id + " 第%s张图片下载成功" % image_count)
                            image_count += 1
                        else:
                            log.error(account_id + " 第%s张图片 %s 下载失败" % (image_count, image_url))

                        # Reached the configured image limit: stop
                        if 0 < GET_IMAGE_COUNT < image_count:
                            is_over = True
                            break

                    if is_over:
                        break

                if not is_over:
                    # Reached the configured page limit: stop
                    if 0 < GET_PAGE_COUNT <= page_count:
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_id + " 下载完毕,总共获得%s张图片" % (image_count - 1))

            # Move the images from the temp directory into the save directory
            if IS_SORT and image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_id + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_id + " 创建图片子目录 %s 失败" % destination_path)
                    tool.process_exit()

            # New archive record
            if first_post_id != "":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_post_id

            # Persist this account's final state
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_id + " 完成")
        except SystemExit, se:
            # tool.process_exit() raises SystemExit; code 0 means a deliberate early stop
            if se.code == 0:
                log.step(account_id + " 提前退出")
            else:
                log.error(account_id + " 异常退出")
예제 #18
0
def http_request(url, method="GET", fields=None, binary_data=None, header_list=None, cookies_list=None, encode_multipart=False, is_auto_redirect=True,
                 is_auto_retry=True, connection_timeout=HTTP_CONNECTION_TIMEOUT, read_timeout=HTTP_READ_TIMEOUT, is_random_ip=True, json_decode=False):
    """Http request via urllib3

    :param url:
        the url you want to visit, must start with "http://" or "https://"

    :param method:
        request method, one of ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]

    :param fields:
        dictionary of request data, will be urlencode()d to a string (post data, query string, etc.)
        ignored when binary_data is given

    :param binary_data:
        binary request body, cannot be used together with fields

    :param header_list:
        custom header dictionary; never mutated — a copy is taken internally

    :param cookies_list:
        custom cookies dictionary, replaces header_list["Cookie"]

    :param encode_multipart:
        see "encode_multipart" in urllib3.request_encode_body

    :param is_auto_redirect:
        follow redirects automatically when response.status in [301, 302, 303, 307, 308]

    :param is_auto_retry:
        retry automatically when response.status in [500, 502, 503, 504]

    :param connection_timeout:
        connection timeout in seconds (0 = no limit)

    :param read_timeout:
        read timeout in seconds (0 = no limit)

    :param is_random_ip:
        forge a random client ip, replaces header_list["X-Forwarded-For"] and header_list["X-Real-Ip"]

    :param json_decode:
        decode the body as json when response status == 200;
        on decode failure the response status is replaced with HTTP_RETURN_CODE_JSON_DECODE_ERROR
    """
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()

    # Bug fix: work on a copy so the caller's header dictionary is never mutated —
    # the User-Agent / X-Forwarded-For / Cookie entries added below used to leak
    # back into the caller's dict and persist across calls.
    if header_list is None:
        header_list = {}
    else:
        header_list = dict(header_list)

    # Default User-Agent
    if "User-Agent" not in header_list:
        header_list["User-Agent"] = _random_user_agent()

    # Forge a random client ip
    if is_random_ip:
        random_ip = _random_ip_address()
        header_list["X-Forwarded-For"] = random_ip
        header_list["X-Real-Ip"] = random_ip

    # Cookies
    if cookies_list:
        header_list["Cookie"] = build_header_cookie_string(cookies_list)

    # Timeout configuration (0 means "no limit" for that phase)
    if connection_timeout == 0 and read_timeout == 0:
        timeout = None
    elif connection_timeout == 0:
        timeout = urllib3.Timeout(read=read_timeout)
    elif read_timeout == 0:
        timeout = urllib3.Timeout(connect=connection_timeout)
    else:
        timeout = urllib3.Timeout(connect=connection_timeout, read=read_timeout)

    retry_count = 0
    while True:
        # Honour the global stop/pause controls before every attempt
        if process.PROCESS_STATUS == process.PROCESS_STATUS_STOP:
            tool.process_exit(0)
        thread_event.wait()

        try:
            if method in ['DELETE', 'GET', 'HEAD', 'OPTIONS']:
                response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields)
            else:
                if binary_data is None:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields, encode_multipart=encode_multipart)
                else:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, body=binary_data, encode_multipart=encode_multipart)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data)
                except ValueError:
                    # Retry the decode with the charset announced in Content-Type
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            except:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            elif response.status in [500, 502, 503, 504] and is_auto_retry:  # transient server error: retry
                if retry_count < HTTP_REQUEST_RETRY_COUNT:
                    retry_count += 1
                    time.sleep(30)
                    continue
                else:
                    return response
            return response
        except urllib3.exceptions.ConnectTimeoutError as e:
            # Domain could not be resolved — not worth retrying
            if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            # other connect timeouts: fall through and retry forever
            pass
        except MemoryError:
            return ErrorResponse(HTTP_RETURN_CODE_RESPONSE_TO_LARGE)
예제 #19
0
    def run(self):
        """Worker thread entry: download all new songs for a single account.

        self.account_info layout: [account_id, newest audio id, optional display name].
        NOTE(review): the video_* counters and VIDEO_* paths actually count/hold songs.
        Updates the shared globals TOTAL_VIDEO_COUNT / ACCOUNTS under self.thread_lock.
        """
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        # Prefer the display name from the save data when present
        if len(self.account_info) >= 3 and self.account_info[2]:
            account_name = self.account_info[2]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            user_id = get_user_id(account_id)
            if user_id is None:
                log.error(account_name + " userid获取失败")
                tool.process_exit()

            page_count = 0
            video_count = 1
            first_audio_id = "0"
            unique_list = []
            is_over = False
            need_make_download_dir = True
            while not is_over:
                # Fetch one page of song metadata
                audio_list = get_one_page_audio_list(user_id, page_count)

                if audio_list is None:
                    log.step(account_name + " 第%s页歌曲列表获取失败" % page_count)
                    first_audio_id = "0"
                    break  # reset so the archive record is left untouched

                # An empty page means everything has been fetched
                if len(audio_list) == 0:
                    break

                for audio_info in list(audio_list):
                    audio_id = audio_info[0]

                    # Stop when the song id is not newer than the last recorded one
                    if int(audio_id) <= int(self.account_info[1]):
                        is_over = True
                        break

                    # Skip duplicates caused by new songs shifting page boundaries
                    if audio_id in unique_list:
                        continue
                    else:
                        unique_list.append(audio_id)
                    # The first (newest) song id becomes the new archive record
                    if first_audio_id == "0":
                        first_audio_id = str(audio_id)

                    # Resolve the song's download url
                    audio_url = get_audio_url(audio_info[2])
                    log.step(account_name + " 开始下载第%s首歌曲 %s" % (video_count, audio_url))

                    # Create the download directory lazily before the first song
                    if need_make_download_dir:
                        if not tool.make_dir(video_path, 0):
                            log.error(account_name + " 创建歌曲下载目录 %s 失败" % video_path)
                            tool.process_exit()
                        need_make_download_dir = False

                    file_path = os.path.join(video_path, "%s - %s.mp3" % (audio_id, audio_info[1]))
                    if tool.save_net_file(audio_url, file_path):
                        log.step(account_name + " 第%s首歌曲下载成功" % video_count)
                        video_count += 1
                    else:
                        log.error(account_name + " 第%s首歌曲 %s 下载失败" % (video_count, audio_url))

                    # Reached the configured download limit: stop
                    if 0 < GET_VIDEO_COUNT < video_count:
                        is_over = True
                        break

                if not is_over:
                    # Fewer songs than one full page (20) means we hit the end;
                    # when the count is an exact multiple of the page size the
                    # next (empty) page decides.
                    if len(audio_list) < 20:
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_name + " 下载完毕,总共获得%s首歌曲" % (video_count - 1))

            # New archive record
            if first_audio_id != "0":
                self.account_info[1] = first_audio_id

            # Persist this account's final state
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            # tool.process_exit() raises SystemExit; code 0 means a deliberate early stop
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #20
0
    def __init__(self, sys_config, extra_config=None):
        """Initialize a crawler from config.ini plus the crawler's sys_config flags.

        :param sys_config:
            dict of SYS_* flags declaring what this crawler supports/needs
            (SYS_DOWNLOAD_IMAGE, SYS_DOWNLOAD_VIDEO, SYS_SET_PROXY,
            SYS_SET_COOKIE, SYS_NOT_CHECK_SAVE_DATA)

        :param extra_config:
            optional dict overriding individual config.ini values
            (save_data_path, image_download_path, image_temp_path,
            video_download_path, video_temp_path)
        """
        global IS_INIT
        self.start_time = time.time()

        # Start-up configuration must be provided by the concrete crawler
        if not isinstance(sys_config, dict):
            self.print_msg("程序启动配置不存在,请检查代码!")
            tool.process_exit()
            return
        sys_download_image = SYS_DOWNLOAD_IMAGE in sys_config
        sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config
        sys_set_proxy = SYS_SET_PROXY in sys_config
        sys_set_cookie = SYS_SET_COOKIE in sys_config
        sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config

        # When frozen into an exe, resolve config.ini relative to the executable
        if tool.IS_EXECUTABLE:
            application_path = os.path.dirname(sys.executable)
            os.chdir(application_path)
            config_path = os.path.join(os.getcwd(), "data\\config.ini")
        else:
            config_path = os.path.join(os.getcwd(), "..\\common\\config.ini")

        config = read_config(config_path)

        if not isinstance(extra_config, dict):
            extra_config = {}

        # Logging configuration
        self.is_show_error = get_config(config, "IS_SHOW_ERROR", True, 2)
        self.is_show_step = get_config(config, "IS_SHOW_STEP", True, 2)
        self.is_show_trace = get_config(config, "IS_SHOW_TRACE", False, 2)
        error_log_path = get_config(config, "ERROR_LOG_PATH", "log/errorLog.txt", 3)
        self.error_log_path = replace_path(error_log_path)
        error_log_dir = os.path.dirname(self.error_log_path)

        if not tool.make_dir(error_log_dir, 0):
            self.print_msg("创建错误日志目录 %s 失败" % error_log_dir)
            tool.process_exit()
            return
        is_log_step = get_config(config, "IS_LOG_STEP", True, 2)
        if not is_log_step:
            self.step_log_path = ""
        else:
            step_log_path = get_config(config, "STEP_LOG_PATH", "log/stepLog.txt", 3)
            self.step_log_path = replace_path(step_log_path)
            # Directory holding the step log file
            step_log_dir = os.path.dirname(self.step_log_path)
            if not tool.make_dir(step_log_dir, 0):
                self.print_msg("创建步骤日志目录 %s 失败" % step_log_dir)
                tool.process_exit()
                return
        is_log_trace = get_config(config, "IS_LOG_TRACE", True, 2)
        if not is_log_trace:
            self.trace_log_path = ""
        else:
            trace_log_path = get_config(config, "TRACE_LOG_PATH", "log/traceLog.txt", 3)
            self.trace_log_path = replace_path(trace_log_path)
            # Directory holding the trace log file
            trace_log_dir = os.path.dirname(self.trace_log_path)
            if not tool.make_dir(trace_log_dir, 0):
                self.print_msg("创建调试日志目录 %s 失败" % trace_log_dir)
                tool.process_exit()
                return

        # Publish the log settings globally once, for the first crawler created
        if not IS_INIT:
            log.IS_SHOW_ERROR = self.is_show_error
            log.IS_SHOW_STEP = self.is_show_step
            log.IS_SHOW_TRACE = self.is_show_trace
            log.ERROR_LOG_PATH = self.error_log_path
            log.STEP_LOG_PATH = self.step_log_path
            log.TRACE_LOG_PATH = self.trace_log_path
            IS_INIT = True

        # What to download: config value AND-ed with the crawler's capability flag
        self.is_download_image = get_config(config, "IS_DOWNLOAD_IMAGE", True, 2) and sys_download_image
        self.is_download_video = get_config(config, "IS_DOWNLOAD_VIDEO", True, 2) and sys_download_video

        if not self.is_download_image and not self.is_download_video:
            # Neither image nor video download is enabled — report which config
            # values disabled a capability the crawler actually supports
            if not self.is_download_image and sys_download_image and not self.is_download_video and sys_download_video:
                self.print_msg("下载图片和视频都没有开启,请检查配置!")
            elif not self.is_download_image and sys_download_image:
                self.print_msg("下载图片没有开启,请检查配置!")
            elif not self.is_download_video and sys_download_video:
                self.print_msg("下载视频没有开启,请检查配置!")
            tool.process_exit()
            return

        # Archive file path (extra_config overrides config.ini)
        if "save_data_path" in extra_config:
            self.save_data_path = extra_config["save_data_path"]
        else:
            self.save_data_path = get_config(config, "SAVE_DATA_PATH", "info/save.data", 3)
        if not sys_not_check_save_data and not os.path.exists(self.save_data_path):
            # Archive file does not exist
            self.print_msg("存档文件%s不存在!" % self.save_data_path)
            tool.process_exit()
            return

        # Image download settings
        if self.is_download_image:
            # Image save directory
            if "image_download_path" in extra_config:
                self.image_download_path = extra_config["image_download_path"]
            else:
                self.image_download_path = get_config(config, "IMAGE_DOWNLOAD_PATH", "photo", 3)
            if not tool.make_dir(self.image_download_path, 0):
                # Could not create the image save directory
                self.print_msg("图片保存目录%s创建失败!" % self.image_download_path)
                tool.process_exit()
                return
            # Temporary image download directory
            if "image_temp_path" in extra_config:
                self.image_temp_path = extra_config["image_temp_path"]
            else:
                self.image_temp_path = get_config(config, "IMAGE_TEMP_PATH", "tempImage", 3)
            # Image download limit; 0 means download everything available
            self.get_image_count = get_config(config, "GET_IMAGE_COUNT", 0, 1)
        else:
            self.image_download_path = ""
            self.image_temp_path = ""
            self.get_image_count = 0
        # Video download settings
        if self.is_download_video:
            # Video save directory
            if "video_download_path" in extra_config:
                self.video_download_path = extra_config["video_download_path"]
            else:
                self.video_download_path = get_config(config, "VIDEO_DOWNLOAD_PATH", "video", 3)
            if not tool.make_dir(self.video_download_path, 0):
                # Could not create the video save directory
                self.print_msg("视频保存目录%s创建失败!" % self.video_download_path)
                tool.process_exit()
                return
            # Temporary video download directory
            if "video_temp_path" in extra_config:
                self.video_temp_path = extra_config["video_temp_path"]
            else:
                self.video_temp_path = get_config(config, "VIDEO_TEMP_PATH", "tempVideo", 3)
            # Video download limit; 0 means download everything available
            self.get_video_count = get_config(config, "GET_VIDEO_COUNT", 0, 1)
        else:
            self.video_download_path = ""
            self.video_temp_path = ""
            self.get_video_count = 0
        # Whether downloaded images need re-sorting into the save directory
        self.is_sort = get_config(config, "IS_SORT", True, 2)
        self.get_page_count = get_config(config, "GET_PAGE_COUNT", 0, 1)

        # Proxy (IS_PROXY: 1 = always, 2 = only when the crawler requests it)
        is_proxy = get_config(config, "IS_PROXY", 2, 1)
        if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
            proxy_ip = get_config(config, "PROXY_IP", "127.0.0.1", 0)
            proxy_port = get_config(config, "PROXY_PORT", "8087", 0)
            tool.set_proxy(proxy_ip, proxy_port)

        # Cookies
        if sys_set_cookie:
            if sys_config[SYS_SET_COOKIE]:  # load cookies from a browser
                # OS & browser type
                browser_type = get_config(config, "BROWSER_TYPE", 2, 1)
                # cookie source path (auto-detected or explicit)
                is_auto_get_cookie = get_config(config, "IS_AUTO_GET_COOKIE", True, 2)
                if is_auto_get_cookie:
                    cookie_path = tool.get_default_browser_cookie_path(browser_type)
                else:
                    cookie_path = get_config(config, "COOKIE_PATH", "", 0)
                if not tool.set_cookie_from_browser(cookie_path, browser_type, sys_config[SYS_SET_COOKIE]):
                    self.print_msg("导入浏览器cookies失败")
                    tool.process_exit()
                    return
            else:  # use an empty cookie jar
                tool.set_empty_cookie()

        # Http settings
        tool.HTTP_CONNECTION_TIMEOUT = get_config(config, "HTTP_CONNECTION_TIMEOUT", 10, 1)
        tool.HTTP_REQUEST_RETRY_COUNT = get_config(config, "HTTP_REQUEST_RETRY_COUNT", 100, 1)

        # Worker thread count and the lock shared by all workers
        self.thread_count = get_config(config, "THREAD_COUNT", 10, 1)
        self.thread_lock = threading.Lock()

        # Keyboard monitor thread (pause / continue hotkeys)
        if get_config(config, "IS_KEYBOARD_EVENT", True, 2):
            keyboard_event_bind = {}
            pause_process_key = get_config(config, "PAUSE_PROCESS_KEYBOARD_KEY", "F9", 0)
            if pause_process_key:
                keyboard_event_bind[pause_process_key] = process.pause_process
            continue_process_key = get_config(config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10", 0)
            if continue_process_key:
                keyboard_event_bind[continue_process_key] = process.continue_process
            if keyboard_event_bind:
                keyboard_control_thread = keyboardEvent.KeyboardEvent(keyboard_event_bind)
                keyboard_control_thread.setDaemon(True)
                keyboard_control_thread.start()

        # Monitor thread that pauses/resumes the download threads
        process_control_thread = process.ProcessControl()
        process_control_thread.setDaemon(True)
        process_control_thread.start()

        self.print_msg("初始化完成")
예제 #21
0
    def run(self):
        global TOTAL_IMAGE_COUNT
        global TOTAL_VIDEO_COUNT

        account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            image_count = 1
            video_count = 1
            data_tweet_id = INIT_MAX_ID
            first_tweet_id = "0"
            is_over = False
            is_download_image = IS_DOWNLOAD_IMAGE
            is_download_video = IS_DOWNLOAD_VIDEO
            need_make_image_dir = True
            need_make_video_dir = True
            while not is_over:
                # 获取指定时间点后的一页图片信息
                media_page = get_media_page_data(account_name, data_tweet_id)
                if media_page is None:
                    log.error(account_name + " 媒体列表解析异常")
                    tool.process_exit()

                # 上一页正好获取了全部的媒体信息,所以这一页没有任何内容,完成了,直接退出
                if media_page["new_latent_count"] == 0 and not media_page["has_more_items"]:
                    break

                tweet_list = get_tweet_list(media_page["items_html"])
                if len(tweet_list) == 0:
                    log.error(account_name + " 媒体列表拆分异常,items_html:%s" % media_page["items_html"])
                    tool.process_exit()

                if media_page["new_latent_count"] != len(tweet_list):
                    log.error(account_name + " 解析的媒体数量不等于new_latent_count的数值")
                    # tool.process_exit()

                for tweet_data in tweet_list:
                    tweet_id = tool.find_sub_string(tweet_data, 'data-tweet-id="', '"')
                    if not tweet_id:
                        log.error(account_name + " tweet id解析异常,tweet数据:%s" % tweet_data)
                        continue

                    # 检查是否tweet的id小于上次的记录
                    if int(tweet_id) <= int(self.account_info[3]):
                        is_over = True
                        break

                    # 将第一个tweet的id做为新的存档记录
                    if first_tweet_id == "0":
                        first_tweet_id = tweet_id

                    # 视频
                    if is_download_image:
                        # 这个tweet是否包含视频
                        if check_has_video(tweet_data):
                            video_file_type, video_url_list = get_video_url_list(tweet_id)
                            if len(video_url_list) > 0:
                                log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url_list))

                                # 第一个视频,创建目录
                                if need_make_video_dir:
                                    if not tool.make_dir(video_path, 0):
                                        log.error(account_name + " 创建图片下载目录 %s 失败" % video_path)
                                        tool.process_exit()
                                    need_make_video_dir = False

                                video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, video_file_type))
                                if save_video(video_url_list, video_file_path):
                                    log.step(account_name + " 第%s个视频下载成功" % video_count)
                                    video_count += 1
                                else:
                                    log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url_list))
                            else:
                                log.error(account_name + " 第%s个视频 没有获取到源地址,tweet id:%s" % (video_count, tweet_id))

                        # 达到配置文件中的下载数量,结束图片下载
                        if 0 < GET_IMAGE_COUNT < image_count:
                            is_download_image = False

                    # 图片
                    if is_download_video:
                        # 匹配获取全部的图片地址
                        image_url_list = get_image_url_list(tweet_data)
                        for image_url in image_url_list:
                            image_url = str(image_url)
                            log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                            image_return_code, image_byte = tool.http_request(image_url)[:2]
                            # 404,不算做错误,图片已经被删掉了
                            if image_return_code == -404:
                                log.error(account_name + " 第%s张图片 %s 已被删除,跳过" % (image_count, image_url))
                            elif image_return_code == 1:
                                # 第一张图片,创建目录
                                if need_make_image_dir:
                                    if not tool.make_dir(image_path, 0):
                                        log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                                        tool.process_exit()
                                    need_make_image_dir = False

                                file_type = image_url.split(".")[-1].split(":")[0]
                                image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                                save_image(image_byte, image_file_path)
                                log.step(account_name + " 第%s张图片下载成功" % image_count)
                                image_count += 1
                            else:
                                log.error(account_name + " 第%s张图片 %s 获取失败" % (image_count, image_url))

                        # 达到配置文件中的下载数量,结束视频下载
                        if 0 < GET_VIDEO_COUNT < video_count:
                            is_download_video = False

                    # 全部达到配置文件中的下载数量,结束
                    if not is_download_image and not is_download_video:
                        is_over = True
                        break

                if not is_over:
                    # 查找下一页的data_tweet_id
                    if media_page["has_more_items"]:
                        data_tweet_id = str(media_page["min_position"])
                    else:
                        is_over = True

            log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))

            # 排序
            if IS_SORT:
                if image_count > 1:
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_name + " 图片从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建图片子目录 %s 失败" % destination_path)
                        tool.process_exit()
                if video_count > 1:
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4):
                        log.step(account_name + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # 新的存档记录
            if first_tweet_id != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = str(int(self.account_info[2]) + video_count - 1)
                self.account_info[3] = first_tweet_id

            # 保存最后的信息
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_name)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #22
0
    def main(self):
        """Crawl blog.mariko-shinoda.net page by page and download every new image.

        Archive file format (tab separated):
            "<sort start index>\t<newest blog id seen in the previous run>"
        Stops as soon as an image belonging to the archived blog id is reached.
        """
        # parse the archive file
        last_blog_id = ""
        image_start_index = 0
        if os.path.exists(self.save_data_path):
            with open(self.save_data_path, "r") as save_file:
                save_info = save_file.read()
            save_info = save_info.split("\t")
            if len(save_info) >= 2:
                image_start_index = int(save_info[0])
                last_blog_id = save_info[1]

        # download
        page_index = 1
        image_count = 1
        is_over = False
        new_last_blog_id = ""
        if self.is_sort:
            image_path = self.image_temp_path
        else:
            image_path = self.image_download_path
        while not is_over:
            # the site numbers pages from 0: page0.html, page1.html, ...
            index_url = "http://blog.mariko-shinoda.net/page%s.html" % (page_index - 1)
            index_page_return_code, index_page = tool.http_request(index_url)[:2]

            if index_page_return_code == 1:
                image_name_list = re.findall('data-original="./([^"]*)"', index_page)
                for image_name in image_name_list:
                    blog_id = image_name.split("-")[0]

                    # stop once we reach an image downloaded in a previous run
                    if blog_id == last_blog_id:
                        is_over = True
                        break

                    # the first (newest) blog id becomes the new archive mark
                    if new_last_blog_id == "":
                        new_last_blog_id = blog_id

                    image_url = "http://blog.mariko-shinoda.net/%s" % image_name
                    # file extension
                    file_type = image_url.split(".")[-1].split(":")[0]
                    file_path = os.path.join(image_path, "%05d.%s" % (image_count, file_type))
                    log.step("开始下载第%s张图片 %s" % (image_count, image_url))
                    if tool.save_net_file(image_url, file_path):
                        log.step("第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        # BUG FIX: failures were logged with log.step; siblings use log.error
                        log.error("第%s张图片 %s 下载失败" % (image_count, image_url))
                page_index += 1
            else:
                log.error("无法访问博客页面 %s" % index_url)
                is_over = True

        log.step("下载完毕")

        # sort and move from the temp directory into the final directory
        if self.is_sort:
            if robot.sort_file(self.image_temp_path, self.image_download_path, image_start_index, 5):
                log.step(" 图片从下载目录移动到保存目录成功")
            else:
                log.error(" 创建图片保存目录 %s 失败" % self.image_download_path)
                tool.process_exit()

        # BUG FIX: when nothing new was found, keep the previous archive mark
        # (the original wrote an empty id, which would re-download everything).
        if new_last_blog_id == "":
            new_last_blog_id = last_blog_id

        # write the new archive file
        new_save_file_path = robot.get_new_save_file_path(self.save_data_path)
        log.step("保存新存档文件 %s" % new_save_file_path)
        # BUG FIX: advance the sort start index by the number of images downloaded
        # this run; the original rewrote the old index, so the next sorted run
        # would number its files over the ones just saved.
        with open(new_save_file_path, "w") as new_save_file:
            new_save_file.write(str(image_start_index + image_count - 1) + "\t" + new_last_blog_id)

        log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), image_count - 1))
예제 #23
0
    def run(self):
        """Thread entry: download new original ("yc") and cover ("fc") songs for one 5sing account.

        self.account_info layout (tab-separated archive row, per usage below):
            [0] account id
            [1] newest original-song id of the previous run
            [2] newest cover-song id of the previous run
            [3] optional account display name
        """
        global TOTAL_VIDEO_COUNT
        global GET_PAGE_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]

        # originals ("yc") / covers ("fc") map to archive columns 1 and 2
        audio_type_to_index = {"yc": 1, "fc": 2}
        try:
            log.step(account_name + " 开始")

            video_count = 1
            for audio_type in audio_type_to_index.keys():
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name, audio_type)

                page_count = 1
                first_audio_id = "0"
                unique_list = []
                is_over = False
                need_make_download_dir = True
                while not is_over:
                    # fetch the song list of one page
                    audio_list = get_one_page_audio_list(account_id, audio_type, page_count)

                    if audio_list is None:
                        log.step(account_name + " 第%s页%s歌曲页面获取失败" % (page_count, audio_type))
                        # reset the mark so the archive is NOT advanced - recovery on next run
                        first_audio_id = "0"
                        break  # archive recovery
                    # an empty page means everything has been fetched
                    if len(audio_list) == 0:
                        break

                    for audio_info in list(audio_list):
                        audio_id = audio_info[0]
                        # strip characters not allowed in file names from the title
                        audio_title = robot.filter_text(audio_info[1])

                        # stop once we reach a song already covered by the last run
                        if int(audio_id) <= int(self.account_info[audio_type_to_index[audio_type]]):
                            is_over = True
                            break

                        # skip duplicates caused by new uploads shifting the paging
                        if audio_id in unique_list:
                            continue
                        else:
                            unique_list.append(audio_id)
                        # the first (newest) song id becomes the new archive mark
                        if first_audio_id == "0":
                            first_audio_id = str(audio_id)

                        # resolve the song's download url
                        audio_url = get_audio_url(audio_id, audio_type_to_index[audio_type])
                        if audio_url is None:
                            log.step(account_name + " %s歌曲ID %s,下载地址获取失败" % (audio_type, audio_id))
                            continue
                        if not audio_url:
                            log.step(account_name + " %s歌曲ID %s,暂不提供下载地址" % (audio_type, audio_id))
                            continue

                        log.step(account_name + " 开始下载第%s首歌曲 %s" % (video_count, audio_url))

                        # first song: create the download directory
                        if need_make_download_dir:
                            if not tool.make_dir(video_path, 0):
                                log.error(account_name + " 创建歌曲下载目录 %s 失败" % video_path)
                                tool.process_exit()
                            need_make_download_dir = False

                        file_path = os.path.join(video_path, "%s - %s.mp3" % (audio_id, audio_title))
                        if tool.save_net_file(audio_url, file_path):
                            log.step(account_name + " 第%s首歌曲下载成功" % video_count)
                            video_count += 1
                        else:
                            log.error(account_name + " 第%s首歌曲 %s 下载失败" % (video_count, audio_url))

                        # configured download count reached - stop
                        if 0 < GET_VIDEO_COUNT < video_count:
                            is_over = True
                            break

                    if not is_over:
                        # configured page count reached - stop
                        if 0 < GET_PAGE_COUNT <= page_count:
                            is_over = True
                        # fewer songs than a full page (presumably 20 per page) means
                        # this was the last one; an exact multiple of the page size is
                        # handled by the next, empty page instead
                        elif len(audio_list) < 20:
                            is_over = True
                        else:
                            page_count += 1

                # new archive mark for this audio type
                if first_audio_id != "0":
                    self.account_info[audio_type_to_index[audio_type]] = first_audio_id

            log.step(account_name + " 下载完毕,总共获得%s首歌曲" % (video_count - 1))

            # persist the updated archive row and the global counters
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #24
0
    def run(self):
        """Thread entry: download new videos then new images for one weibo account.

        self.account_info layout (tab-separated archive row, per usage below):
            [0] account id
            [1] total image count downloaded so far
            [2] timestamp of the newest image of the previous run
            [3] total video count downloaded so far
            [4] play-page url of the newest video of the previous run
            [5] optional account display name
        """
        global TOTAL_IMAGE_COUNT
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 6 and self.account_info[5]:
            account_name = self.account_info[5]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # download into a temp directory when re-sorting is enabled,
            # otherwise straight into the final directory
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            # ---- videos ----
            video_count = 1
            account_page_id = None
            first_video_url = ""
            is_over = False
            need_make_video_dir = True
            since_id = INIT_SINCE_ID
            while IS_DOWNLOAD_VIDEO and (not is_over):
                # lazily resolve the account's page_id on the first iteration
                if account_page_id is None:
                    account_page_id = get_account_page_id(account_id)
                    if account_page_id is None:
                        log.error(account_name + " 微博主页没有获取到page_id")
                        break

                # fetch one page of video info after the given since_id
                video_page_data = get_one_page_video_data(account_page_id, since_id)
                if video_page_data is None:
                    log.error(account_name + " 视频列表解析异常")
                    # reset the mark so the archive is NOT advanced - recovery on next run
                    first_video_url = ""  # archive recovery
                    break

                # extract every video play-page url from this page
                video_play_url_list = get_video_play_url_list(video_page_data)
                log.trace(account_name + "since_id:%s中的全部视频:%s" % (since_id, video_play_url_list))
                for video_play_url in video_play_url_list:
                    # stop once we reach the last video of the previous run
                    if self.account_info[4] == video_play_url:
                        is_over = True
                        break

                    # the first (newest) video url becomes the new archive mark
                    if first_video_url == "":
                        first_video_url = video_play_url

                    # resolve the actual download url(s) for this video
                    return_code, video_url_list = get_video_url(video_play_url)
                    if return_code != 1:
                        if return_code == -1:
                            log.error(account_name + " 第%s个视频 %s 没有获取到源地址" % (video_count, video_play_url))
                        elif return_code == -2:
                            log.error(account_name + " 第%s个视频 %s 无法访问" % (video_count, video_play_url))
                        elif return_code == -3:
                            log.error(account_name + " 第%s个视频 %s 暂不支持的视频源" % (video_count, video_play_url))
                        continue
                    log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_play_url))

                    # first video: create the download directory
                    if need_make_video_dir:
                        if not tool.make_dir(video_path, 0):
                            # NOTE(review): message says "图片" (image) but this is the video directory
                            log.error(account_name + " 创建图片下载目录 %s 失败" % video_path)
                            tool.process_exit()
                        need_make_video_dir = False

                    video_file_path = os.path.join(video_path, "%04d.mp4" % video_count)
                    # try each candidate url until one succeeds
                    for video_url in video_url_list:
                        if tool.save_net_file(video_url, video_file_path):
                            log.step(account_name + " 第%s个视频下载成功" % video_count)
                            video_count += 1
                        else:
                            log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))

                    # configured video count reached - stop
                    if 0 < GET_VIDEO_COUNT < video_count:
                        is_over = True
                        break

                if not is_over:
                    # extract the next page's since_id; none means no more pages
                    since_id = tool.find_sub_string(video_page_data, "type=video&owner_uid=&since_id=", '">')
                    if not since_id:
                        break

            # an archive mark exists but the normal-end flag was never set:
            # the previously-newest video has presumably been deleted
            if self.account_info[4] != "" and video_count > 1 and not is_over:
                log.error(account_name + " 没有找到上次下载的最后一个视频地址")

            # ---- images ----
            image_count = 1
            page_count = 1
            first_image_time = "0"
            unique_list = []
            is_over = False
            need_make_image_dir = True
            while IS_DOWNLOAD_IMAGE and (not is_over):
                # fetch one page of photo info
                photo_page_data = get_one_page_photo_data(account_id, page_count)
                if photo_page_data is None:
                    log.error(account_name + " 图片列表获取失败")
                    # reset the mark so the archive is NOT advanced - recovery on next run
                    first_image_time = "0"  # archive recovery
                    break

                log.trace(account_name + "第%s页的全部图片信息:%s" % (page_count, photo_page_data))
                for image_info in photo_page_data["photo_list"]:
                    if not robot.check_sub_key(("pic_host", "pic_name", "timestamp"), image_info):
                        log.error(account_name + " 第%s张图片信息解析失败 %s" % (image_count, image_info))
                        continue

                    # stop once we reach an image not newer than the archived timestamp
                    if int(image_info["timestamp"]) <= int(self.account_info[2]):
                        is_over = True
                        break

                    # skip duplicates caused by new uploads shifting the paging
                    if image_info["pic_name"] in unique_list:
                        continue
                    else:
                        unique_list.append(image_info["pic_name"])
                    # the first (newest) upload time becomes the new archive mark
                    if first_image_time == "0":
                        first_image_time = str(image_info["timestamp"])

                    image_url = str(image_info["pic_host"]) + "/large/" + str(image_info["pic_name"])
                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                    # fetch the image bytes and check that the resource is still valid
                    image_status, image_byte = get_image_byte(image_url)
                    if image_status != 1:
                        if image_status == -1:
                            log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                        elif image_status == -2:
                            log.error(account_name + " 第%s张图片 %s 资源已被删除,跳过" % (image_count, image_url))
                        continue

                    # first image: create the download directory
                    if need_make_image_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_image_dir = False

                    file_type = image_url.split(".")[-1]
                    # a "/" in the tail means the url had no real extension; default to jpg
                    if file_type.find("/") != -1:
                        file_type = "jpg"
                    image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    save_image(image_byte, image_file_path)
                    log.step(account_name + " 第%s张图片下载成功" % image_count)
                    image_count += 1

                    # configured image count reached - stop
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break

                if not is_over:
                    # decide from the total photo count and the page size whether another page exists
                    if (photo_page_data["total"] / IMAGE_COUNT_PER_PAGE) > (page_count - 1):
                        page_count += 1
                    else:
                        # all images fetched
                        is_over = True

            log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))

            # move files from the temp directory into the numbered final directory
            if IS_SORT:
                if first_image_time != "0":
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_name + " 图片从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                        tool.process_exit()
                if first_video_url != "":
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4):
                        log.step(account_name + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # new archive marks (only when something new was actually seen)
            if first_image_time != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_image_time
            if first_video_url != "":
                self.account_info[3] = str(int(self.account_info[3]) + video_count - 1)
                self.account_info[4] = first_video_url

            # persist the updated archive row and the global counters
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #25
0
    def run(self):
        """Thread entry: download all new photos for one flickr account via the site API.

        self.account_info layout (tab-separated archive row, per usage below):
            [0] account name
            [1] total image count downloaded so far
            [2] upload timestamp of the newest photo of the previous run
        """
        global TOTAL_IMAGE_COUNT

        account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # download into a temp directory when re-sorting is enabled,
            # otherwise straight into the final directory
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)

            # resolve the API user id and site key for this account
            api_info = get_api_info(account_name)
            if api_info is None:
                log.error(account_name + " API信息查找失败")
                tool.process_exit()
            if not api_info["user_id"]:
                log.error(account_name + " user_id解析失败")
                tool.process_exit()
            if not api_info["site_key"]:
                log.error(account_name + " site_key解析失败")
                tool.process_exit()
            # generate a random request id for the API calls (purpose unclear;
            # we just mimic what the web page sends)
            request_id = tool.generate_random_string(8)

            # images
            image_count = 1
            page_count = 1
            first_image_time = "0"
            is_over = False
            need_make_image_dir = True
            while not is_over:
                # fetch one page of photo info
                page_data = get_one_page_image_data(api_info["user_id"], page_count, api_info["site_key"], request_id)
                if page_data is None:
                    log.error(account_name + " 第%s页图片信息获取失败" % page_count)
                    tool.process_exit()

                for photo_info in page_data["photos"]["photo"]:
                    if "dateupload" not in photo_info:
                        log.error(account_name + " 第%s张图片上传时间获取失败,图片信息:%s" % (image_count, photo_info))
                        continue

                    # stop once we reach a photo not newer than the archived upload time
                    # (original comment said "视频"/video - this loop handles photos)
                    if int(self.account_info[2]) >= int(photo_info["dateupload"]):
                        is_over = True
                        break

                    # the first (newest) upload time becomes the new archive mark
                    if first_image_time == "0":
                        first_image_time = str(photo_info["dateupload"])

                    # prefer the CDN original url, fall back to the plain original
                    if "url_o_cdn" in photo_info:
                        image_url = str(photo_info["url_o_cdn"])
                    elif "url_o" in photo_info:
                        image_url = str(photo_info["url_o"])
                    else:
                        log.error(account_name + " 第%s张图片下载地址获取失败,图片信息:%s" % (image_count, photo_info))
                        continue
                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                    # first image: create the download directory
                    if need_make_image_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_image_dir = False

                    file_type = image_url.split(".")[-1]
                    file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_name + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))

                    # configured image count reached - stop
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break

                if not is_over:
                    # configured page count reached - stop
                    if 0 < GET_PAGE_COUNT <= page_count:
                        is_over = True
                    elif page_count >= int(page_data["photos"]["pages"]):
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))

            # move files from the temp directory into the numbered final directory
            if IS_SORT:
                if first_image_time != "0":
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_name + " 图片从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # new archive mark (only when something new was actually seen)
            if first_image_time != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_image_time

            # persist the updated archive row and the global counters
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            ACCOUNTS.remove(account_name)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #26
0
    def run(self):
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录
            if IS_SORT:
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            video_count = 1
            page_time = 0
            first_video_time = "0"
            need_make_video_dir = True
            is_over = False
            while not is_over:
                # 获取一页视频信息
                video_data = get_one_page_video_data(account_id, page_time)
                if video_data is None:
                    log.error(account_name + " 视频列表获取失败")
                    tool.process_exit()

                for video_info in video_data["info"]:
                    if not robot.check_sub_key(("newvideos", "id", "timestamp"), video_info):
                        log.error(account_name + " 视频信息 %s 解析失败" % video_info)
                        continue

                    page_time = int(video_info["timestamp"])
                    # 检查是否已下载到前一次的视频
                    if page_time <= int(self.account_info[2]):
                        is_over = True
                        break

                    # 将第一个视频的上传时间做为新的存档记录
                    if first_video_time == "0":
                        first_video_time = str(page_time)

                    # todo 处理如果有多个视频
                    if len(video_info["newvideos"]) != 1:
                        log.error(account_name + " 视频信息 %s 发现多个视频下载信息" % video_info)
                        continue
                    if not robot.check_sub_key(("vid",), video_info["newvideos"][0]):
                        log.error(account_name + " 视频信息 %s 解析vid失败" % video_info)
                        continue

                    # 获取视频下载地址
                    video_url = get_video_url(video_info["newvideos"][0]["vid"], video_info["id"])
                    log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url))

                    # 第一个视频,创建目录
                    if need_make_video_dir:
                        if not tool.make_dir(video_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % video_path)
                            tool.process_exit()
                        need_make_video_dir = False

                    file_type = video_url.split(".")[-1].split("?")[0]
                    file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type))
                    if tool.save_net_file(video_url, file_path):
                        log.step(account_name + " 第%s个视频下载成功" % video_count)
                        video_count += 1
                    else:
                        log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))

                    # 达到配置文件中的下载数量,结束
                    if 0 < GET_VIDEO_COUNT < video_data:
                        is_over = True
                        break

                if not is_over:
                    if not video_data["hasNext"]:
                        is_over = True

            log.step(account_name + " 下载完毕,总共获得%s个视频" % (video_count - 1))

            # 排序
            if IS_SORT:
                if first_video_time != "0":
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4):
                        log.step(account_name + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # 新的存档记录
            if first_video_time != "0":
                self.account_info[3] = str(int(self.account_info[3]) + video_count - 1)
                self.account_info[4] = first_video_time

            # 保存最后的信息
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #27
0
    def run(self):
        """Download all new article images of one Weibo account and update its archive row.

        self.account_info layout: [0] account id, [1] publish time of the newest
        downloaded article, [2] display name (optional).
        """
        global TOTAL_IMAGE_COUNT

        account_id = self.account_info[0]
        # Prefer the display-name column when it exists and is non-empty
        if len(self.account_info) >= 3 and self.account_info[2]:
            account_name = self.account_info[2]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # Resolve the page_id that corresponds to this account
            account_page_id = get_account_page_id(account_id)
            if account_page_id is None:
                log.error(account_name + " 微博主页没有获取到page_id")
                tool.process_exit()

            page_count = 1
            this_account_total_image_count = 0
            first_article_time = "0"
            is_over = False
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
            while not is_over:
                # Fetch one page of article previews
                preview_article_page = get_one_page_preview_article_data(account_page_id, page_count)

                if preview_article_page is None:
                    log.error(account_name + " 第%s页文章获取失败" % page_count)
                    tool.process_exit()

                # Split the preview page content into per-article chunks
                preview_article_data_list = get_preview_article_data_list(preview_article_page)
                if len(preview_article_data_list) == 0:
                    log.error(account_name + " 第%s页文章解析失败,页面:%s" % (page_count, preview_article_page))
                    tool.process_exit()

                for preview_article_data in preview_article_data_list:
                    # Parse the article's publish time
                    article_time = get_article_time(preview_article_data)
                    if article_time is None:
                        log.error(account_name + " 预览 %s 中的文章发布时间解析失败" % preview_article_data)
                        continue

                    # Stop once we reach an article already handled in the previous run
                    # (the original comment said "video"; this crawler downloads articles)
                    if article_time <= int(self.account_info[1]):
                        is_over = True
                        break

                    # The first (newest) article's publish time becomes the new archive mark
                    if first_article_time == "0":
                        first_article_time = str(article_time)

                    # Parse the article URL
                    article_url = get_article_url(preview_article_data)
                    if article_url is None:
                        log.error(account_name + " 预览 %s 中的文章地址解析失败" % preview_article_data)
                        continue

                    # Parse the article id from its URL
                    article_id = get_article_id(article_url)
                    if article_id is None:
                        log.error(account_name + " 文章地址 %s 解析文章id失败" % article_url)
                        continue

                    # Fetch the article page, following redirects
                    article_page = auto_redirect_visit(article_url)
                    if not article_page:
                        log.error(account_name + " 文章 %s 获取失败" % article_url)
                        continue

                    # Article title
                    # NOTE(review): article_id[0] is passed here while article_id itself
                    # is used for paths below - presumably the first character encodes
                    # the article type; confirm against get_article_id's return value.
                    title = get_article_title(article_page, article_id[0])
                    # Strip characters the filesystem does not support from the title
                    title = robot.filter_text(title)
                    if title:
                        article_path = os.path.join(image_path, "%s %s" % (article_id, title))
                    else:
                        article_path = os.path.join(image_path, article_id)
                    if not tool.make_dir(article_path, 0):
                        # Directory creation failed; retry once without the title,
                        # and give up if that fails too
                        log.error(account_name + " 创建文章目录 %s 失败,尝试不使用title" % article_path)
                        article_path = os.path.join(image_path, article_id)
                        if not tool.make_dir(article_path, 0):
                            log.error(account_name + " 创建文章目录 %s 失败" % article_path)
                            tool.process_exit()

                    # Article header (top) picture, saved as 0000.<ext>
                    top_picture_url = get_article_top_picture_url(article_page)
                    if top_picture_url:
                        log.step(account_name + " %s 开始下载顶部图片 %s" % (title, top_picture_url))

                        file_type = top_picture_url.split(".")[-1]
                        file_path = os.path.join(article_path, "0000.%s" % file_type)
                        if tool.save_net_file(top_picture_url, file_path):
                            log.step(account_name + " %s 顶部图片下载成功" % title)
                            this_account_total_image_count += 1
                        else:
                            log.error(account_name + " %s 顶部图片 %s 下载失败" % (title, top_picture_url))

                    # Collect the image URLs from the article body
                    image_url_list = get_article_image_url_list(article_page, article_id[0])
                    if image_url_list is None:
                        log.error(account_name + " 文章 %s 正文解析失败" % article_url)
                        continue

                    image_count = 1
                    for image_url in list(image_url_list):
                        # Skip Weibo-internal decoration images
                        if image_url.find("/p/e_weibo_com") >= 0 or image_url.find("e.weibo.com") >= 0:
                            continue
                        log.step(account_name + " %s 开始下载第%s张图片 %s" % (title, image_count, image_url))

                        file_type = image_url.split(".")[-1]
                        file_path = os.path.join(article_path, "%s.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_name + " %s 第%s张图片下载成功" % (title, image_count))
                            image_count += 1
                        else:
                            log.error(account_name + " %s 第%s张图片 %s 下载失败" % (title, image_count, image_url))

                    if image_count > 1:
                        this_account_total_image_count += image_count - 1

                if not is_over:
                    # Advance to the next page until the last page is reached
                    if page_count >= get_max_page_count(preview_article_page):
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_name + " 下载完毕,总共获得%s张图片" % this_account_total_image_count)

            # New archive record: newest article publish time
            if first_article_time != "0":
                self.account_info[1] = first_article_time

            # Persist the updated archive row
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += this_account_total_image_count
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #28
0
    def __init__(self, sys_config, **kwargs):
        """Crawler base-class initialization: read config files, resolve download
        paths, set up proxy/cookies/threading and optional control threads.

        :Args:
        - sys_config
            - download_photo - whether the app supports photo download, default: False
            - download_video - whether the app supports video download, default: False
            - download_audio - whether the app supports audio download, default: False
            - download_content - whether the app supports text content download, default: False
            - set_proxy - whether the app needs a proxy by default, default: False
            - no_save_data - whether the app can run without a save-data file, default: False
            - no_download - the app performs no downloads at all, default: False
            - get_cookie - whether the app needs cookie values from browser storage, default: False
            - app_config - extra app config; overrides other values on conflict
            - app_config_path - default app config file path; when set, app.ini is not read
        - kwargs
            - extra_sys_config - program config passed in at instantiation
            - extra_app_config - app config passed in at instantiation
        """
        self.start_time = time.time()

        # Program startup configuration must be a dict
        if not isinstance(sys_config, dict):
            output.print_msg("程序启动配置不存在,请检查代码!")
            tool.process_exit()
            return
        # Extra init config (passed at instantiation; overrides the sys_config
        # argument supplied by the subclass __init__)
        if "extra_sys_config" in kwargs and isinstance(
                kwargs["extra_sys_config"], dict):
            sys_config.update(kwargs["extra_sys_config"])
        sys_download_photo = SYS_DOWNLOAD_PHOTO in sys_config and sys_config[
            SYS_DOWNLOAD_PHOTO]
        sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config and sys_config[
            SYS_DOWNLOAD_VIDEO]
        sys_download_audio = SYS_DOWNLOAD_AUDIO in sys_config and sys_config[
            SYS_DOWNLOAD_AUDIO]
        sys_download_content = SYS_DOWNLOAD_CONTENT in sys_config and sys_config[
            SYS_DOWNLOAD_CONTENT]
        sys_set_proxy = SYS_SET_PROXY in sys_config and sys_config[
            SYS_SET_PROXY]
        sys_get_cookie = SYS_GET_COOKIE in sys_config and sys_config[
            SYS_GET_COOKIE]
        sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config and sys_config[
            SYS_NOT_CHECK_SAVE_DATA]
        sys_not_download = SYS_NOT_DOWNLOAD in sys_config and sys_config[
            SYS_NOT_DOWNLOAD]

        # Frozen executable: work relative to the executable's directory
        if tool.IS_EXECUTABLE:
            application_path = os.path.dirname(sys.executable)
            os.chdir(application_path)
            config_path = os.path.join(os.getcwd(), "data/config.ini")
        else:
            config_path = PROJECT_CONFIG_PATH

        # Program config
        config = read_config(config_path)
        # App config (per-application ini layered on top of the program config)
        if SYS_APP_CONFIG_PATH in sys_config:
            app_config_path = sys_config[SYS_APP_CONFIG_PATH]
        else:
            app_config_path = os.path.abspath(
                os.path.join(PROJECT_APP_PATH, "app.ini"))
        if os.path.exists(app_config_path):
            config.update(read_config(app_config_path))
        # Extra app config (passed at instantiation; overrides file values)
        if "extra_app_config" in kwargs and isinstance(
                kwargs["extra_app_config"], dict):
            config.update(kwargs["extra_app_config"])

        # App-declared config entries: (name, default, analysis mode) triples
        self.app_config = {}
        if SYS_APP_CONFIG in sys_config and len(
                sys_config[SYS_APP_CONFIG]) > 0:
            for app_config_template in sys_config[SYS_APP_CONFIG]:
                if len(app_config_template) == 3:
                    self.app_config[app_config_template[0]] = analysis_config(
                        config, app_config_template[0], app_config_template[1],
                        app_config_template[2])

        # Download switches: enabled only when both the config file and the
        # app's sys_config support the media type
        self.is_download_photo = analysis_config(
            config, "IS_DOWNLOAD_PHOTO", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_photo
        self.is_download_video = analysis_config(
            config, "IS_DOWNLOAD_VIDEO", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_video
        self.is_download_audio = analysis_config(
            config, "IS_DOWNLOAD_AUDIO", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_audio
        self.is_download_content = analysis_config(
            config, "IS_DOWNLOAD_CONTENT", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_content

        if not sys_not_download and not self.is_download_photo and not self.is_download_video and not self.is_download_audio and not self.is_download_content:
            if sys_download_photo or sys_download_video or sys_download_audio or sys_download_content:
                output.print_msg("所有支持的下载都没有开启,请检查配置!")
                tool.process_exit()
                return

        # Whether to overwrite an existing file with the same name when downloading
        net.DOWNLOAD_REPLACE_IF_EXIST = analysis_config(
            config, "IS_DOWNLOAD_REPLACE_IF_EXIST", False,
            CONFIG_ANALYSIS_MODE_BOOLEAN)

        # Save-data (archive) file
        self.save_data_path = analysis_config(config, "SAVE_DATA_PATH",
                                              "\\\\info/save.data",
                                              CONFIG_ANALYSIS_MODE_PATH)
        self.temp_save_data_path = ""
        if not sys_not_check_save_data:
            if not os.path.exists(self.save_data_path):
                # Save-data file does not exist
                output.print_msg(f"存档文件{self.save_data_path}不存在!")
                tool.process_exit()
                return
            temp_file_name = tool.get_time("%m-%d_%H_%M_") + os.path.basename(
                self.save_data_path)
            self.temp_save_data_path = os.path.join(
                os.path.dirname(self.save_data_path), temp_file_name)
            if os.path.exists(self.temp_save_data_path):
                # Temporary save-data file already exists
                output.print_msg(f"存档临时文件{self.temp_save_data_path}已存在!")
                tool.process_exit()
                return

        # Cache directory
        self.cache_data_path = analysis_config(config, "CACHE_DATA_PATH",
                                               "\\\\cache",
                                               CONFIG_ANALYSIS_MODE_PATH)

        # Session data file
        self.session_data_path = analysis_config(config, "SESSION_DATA_PATH",
                                                 "\\\\info/session.data",
                                                 CONFIG_ANALYSIS_MODE_PATH)

        # Photo save directory (only when photo download is enabled)
        if self.is_download_photo:
            self.photo_download_path = analysis_config(
                config, "PHOTO_DOWNLOAD_PATH", "\\\\photo",
                CONFIG_ANALYSIS_MODE_PATH)
        else:
            self.photo_download_path = ""
        # Video save directory (only when video download is enabled)
        if self.is_download_video:
            self.video_download_path = analysis_config(
                config, "VIDEO_DOWNLOAD_PATH", "\\\\video",
                CONFIG_ANALYSIS_MODE_PATH)
        else:
            self.video_download_path = ""
        # Audio save directory (only when audio download is enabled)
        if self.is_download_audio:
            self.audio_download_path = analysis_config(
                config, "AUDIO_DOWNLOAD_PATH", "\\\\audio",
                CONFIG_ANALYSIS_MODE_PATH)
        else:
            self.audio_download_path = ""
        # Text content save directory (only when content download is enabled)
        if self.is_download_content:
            self.content_download_path = analysis_config(
                config, "CONTENT_DOWNLOAD_PATH", "\\\\content",
                CONFIG_ANALYSIS_MODE_PATH)
        else:
            self.content_download_path = ""

        # Whether a thread should exit after a download failure
        # (bug fix: the default was the path string "\\\\content", copy-pasted
        # from the content-path block above; a boolean config needs a boolean default)
        self.is_thread_exit_after_download_failure = analysis_config(
            config, "IS_THREAD_EXIT_AFTER_DOWNLOAD_FAILURE", False,
            CONFIG_ANALYSIS_MODE_BOOLEAN)

        # Proxy: 1 = always, 2 = only when the app asks for one
        is_proxy = analysis_config(config, "IS_PROXY", 2,
                                   CONFIG_ANALYSIS_MODE_INTEGER)
        if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
            proxy_ip = analysis_config(config, "PROXY_IP", "127.0.0.1")
            proxy_port = analysis_config(config, "PROXY_PORT", "8087")
            # Connection pool that goes through the proxy
            net.set_proxy(proxy_ip, proxy_port)
        else:
            # Plain urllib3 connection pool
            net.init_http_connection_pool()

        # Cookies read from the configured browser's storage
        self.cookie_value = {}
        if sys_get_cookie:
            # Operating system & browser
            browser_type = analysis_config(config, "BROWSER_TYPE", 2,
                                           CONFIG_ANALYSIS_MODE_INTEGER)
            # Explicit cookie path, falling back to the browser's default location
            cookie_path = analysis_config(config, "COOKIE_PATH", "",
                                          CONFIG_ANALYSIS_MODE_RAW)
            if cookie_path:
                cookie_path = analysis_config(config, "COOKIE_PATH", "",
                                              CONFIG_ANALYSIS_MODE_PATH)
            else:
                cookie_path = browser.get_default_browser_cookie_path(
                    browser_type)
            all_cookie_from_browser = browser.get_all_cookie_from_browser(
                browser_type, cookie_path)
            if browser_type == browser.BROWSER_TYPE_TEXT:
                if "DEFAULT" in all_cookie_from_browser:
                    self.cookie_value.update(
                        all_cookie_from_browser["DEFAULT"])
            else:
                for cookie_domain in sys_config[SYS_GET_COOKIE]:
                    # Match both the bare domain and its dot-prefixed variant
                    check_domain_list = [cookie_domain]
                    if cookie_domain[0] != ".":
                        check_domain_list.append("." + cookie_domain)
                    else:
                        check_domain_list.append(cookie_domain[1:])
                    for check_domain in check_domain_list:
                        if check_domain in all_cookie_from_browser:
                            self.cookie_value.update(
                                all_cookie_from_browser[check_domain])

        # Threading
        self.thread_count = analysis_config(config, "THREAD_COUNT", 10,
                                            CONFIG_ANALYSIS_MODE_INTEGER)
        self.thread_lock = threading.Lock()  # guards shared global state
        self.thread_semaphore = threading.Semaphore(
            self.thread_count)  # caps the number of concurrent worker threads

        # Optional port listener that can pause/resume/stop the other download threads
        if analysis_config(config, "IS_PORT_LISTENER_EVENT", False,
                           CONFIG_ANALYSIS_MODE_BOOLEAN):
            listener_port = analysis_config(config, "LISTENER_PORT", 12345,
                                            CONFIG_ANALYSIS_MODE_INTEGER)
            listener_event_bind = {
                str(portListenerEvent.PROCESS_STATUS_PAUSE):
                net.pause_request,  # pause the process
                str(portListenerEvent.PROCESS_STATUS_RUN):
                net.resume_request,  # resume the process
                str(portListenerEvent.PROCESS_STATUS_STOP):
                self.stop_process  # stop the process (cancel threads, finish up)
            }
            process_control_thread = portListenerEvent.PortListenerEvent(
                port=listener_port, event_list=listener_event_bind)
            # .daemon instead of the deprecated setDaemon()
            process_control_thread.daemon = True
            process_control_thread.start()

        # Keyboard monitor thread (Windows only)
        if platform.system() == "Windows" and analysis_config(
                config, "IS_KEYBOARD_EVENT", False,
                CONFIG_ANALYSIS_MODE_BOOLEAN):
            keyboard_event_bind = {}
            pause_process_key = analysis_config(config,
                                                "PAUSE_PROCESS_KEYBOARD_KEY",
                                                "F9")
            # Pause the process
            if pause_process_key:
                keyboard_event_bind[pause_process_key] = self.pause_process
            # Resume the process
            continue_process_key = analysis_config(
                config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10")
            if continue_process_key:
                keyboard_event_bind[continue_process_key] = self.resume_process
            # Stop the process (cancel threads, finish up)
            stop_process_key = analysis_config(config,
                                               "STOP_PROCESS_KEYBOARD_KEY",
                                               "CTRL + F12")
            if stop_process_key:
                keyboard_event_bind[stop_process_key] = self.stop_process

            if keyboard_event_bind:
                keyboard_control_thread = keyboardEvent.KeyboardEvent(
                    keyboard_event_bind)
                keyboard_control_thread.daemon = True
                keyboard_control_thread.start()

        # Per-run state and totals
        self.save_data = {}
        self.total_photo_count = 0
        self.total_video_count = 0
        self.total_audio_count = 0

        output.print_msg("初始化完成")
예제 #29
0
    def run(self):
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 3 and self.account_info[2]:
            account_name = self.account_info[2]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录
            if IS_SORT:
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            # 获取视频信息列表
            video_info_list = get_video_info_list(account_id)
            if video_info_list is None:
                log.error(account_name + " 视频列表获取失败")
                tool.process_exit()

            video_count = 1
            first_video_id = "0"
            need_make_video_dir = True
            for video_info in video_info_list:
                if not robot.check_sub_key(("item_data",), video_info) or \
                        not robot.check_sub_key(("watch_id", "title"), video_info["item_data"]):
                    log.error(account_name + " 视频信息%s解析失败" % video_info)
                    tool.process_exit()

                # sm30043563
                video_id = str(video_info["item_data"]["watch_id"])

                # 过滤标题中不支持的字符
                video_title = robot.filter_text(video_info["item_data"]["title"])

                # 第一个视频,创建目录
                if need_make_video_dir:
                    if not tool.make_dir(video_path, 0):
                        log.error(account_name + " 创建图片下载目录 %s 失败" % video_path)
                        tool.process_exit()
                    need_make_video_dir = False

                # 获取视频下载地址
                video_url = get_video_url(video_id)
                log.step(account_name + " 开始下载第%s个视频 %s %s" % (video_count, video_id, video_url))
                print video_title
                print "%s %s" % (video_id, video_title)
                file_path = os.path.join(video_path, "%s %s.mp4" % (video_id, video_title))
                if tool.save_net_file(video_url, file_path):
                    log.step(account_name + " 第%s个视频下载成功" % video_count)
                    video_count += 1
                else:
                    log.error(account_name + " 第%s个视频 %s %s 下载失败" % (video_count, video_id, video_url))

            log.step(account_name + " 下载完毕,总共获得%s个视频" % (video_count - 1))

            # 排序
            if IS_SORT:
                if first_video_id != "0":
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4):
                        log.step(account_name + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # 新的存档记录
            if first_video_id != "0":
                self.account_info[1] = first_video_id

            # 保存最后的信息
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #30
0
    def __init__(self, sys_config, extra_config=None):
        global IS_INIT
        self.start_time = time.time()

        # 程序启动配置
        if not isinstance(sys_config, dict):
            self.print_msg("程序启动配置不存在,请检查代码!")
            tool.process_exit()
            return
        sys_download_image = SYS_DOWNLOAD_IMAGE in sys_config
        sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config
        sys_set_proxy = SYS_SET_PROXY in sys_config
        sys_get_cookie = SYS_GET_COOKIE in sys_config
        sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config

        # exe程序
        if tool.IS_EXECUTABLE:
            application_path = os.path.dirname(sys.executable)
            os.chdir(application_path)
            config_path = os.path.join(os.getcwd(), "data/config.ini")
        else:
            config_path = tool.PROJECT_CONFIG_PATH

        # 程序配置
        config = read_config(config_path)
        if not isinstance(extra_config, dict):
            extra_config = {}

        # 应用配置
        self.app_config = {}
        if SYS_APP_CONFIG in sys_config and len(sys_config[SYS_APP_CONFIG]) >= 2:
            app_config = read_config(sys_config[SYS_APP_CONFIG][0])
            for app_config_template in  sys_config[SYS_APP_CONFIG][1:]:
                if len(app_config_template) == 3:
                    self.app_config[app_config_template[0]] = get_config(app_config, app_config_template[0], app_config_template[1], app_config_template[2])

        # 日志
        self.is_show_error = get_config(config, "IS_SHOW_ERROR", True, 2)
        self.is_show_step = get_config(config, "IS_SHOW_STEP", True, 2)
        self.is_show_trace = get_config(config, "IS_SHOW_TRACE", False, 2)
        error_log_path = get_config(config, "ERROR_LOG_PATH", "\\log/errorLog.txt", 3)
        self.error_log_path = replace_path(error_log_path)
        error_log_dir = os.path.dirname(self.error_log_path)

        if not tool.make_dir(error_log_dir, 0):
            self.print_msg("创建错误日志目录 %s 失败" % error_log_dir)
            tool.process_exit()
            return
        is_log_step = get_config(config, "IS_LOG_STEP", True, 2)
        if not is_log_step:
            self.step_log_path = ""
        else:
            step_log_path = get_config(config, "STEP_LOG_PATH", "\\log/stepLog.txt", 3)
            self.step_log_path = replace_path(step_log_path)
            # 日志文件保存目录
            step_log_dir = os.path.dirname(self.step_log_path)
            if not tool.make_dir(step_log_dir, 0):
                self.print_msg("创建步骤日志目录 %s 失败" % step_log_dir)
                tool.process_exit()
                return
        is_log_trace = get_config(config, "IS_LOG_TRACE", True, 2)
        if not is_log_trace:
            self.trace_log_path = ""
        else:
            trace_log_path = get_config(config, "TRACE_LOG_PATH", "\\log/traceLog.txt", 3)
            self.trace_log_path = replace_path(trace_log_path)
            # 日志文件保存目录
            trace_log_dir = os.path.dirname(self.trace_log_path)
            if not tool.make_dir(trace_log_dir, 0):
                self.print_msg("创建调试日志目录 %s 失败" % trace_log_dir)
                tool.process_exit()
                return

        if not IS_INIT:
            log.IS_SHOW_ERROR = self.is_show_error
            log.IS_SHOW_STEP = self.is_show_step
            log.IS_SHOW_TRACE = self.is_show_trace
            log.ERROR_LOG_PATH = self.error_log_path
            log.STEP_LOG_PATH = self.step_log_path
            log.TRACE_LOG_PATH = self.trace_log_path
            IS_INIT = True

        # 是否下载
        self.is_download_image = get_config(config, "IS_DOWNLOAD_IMAGE", True, 2) and sys_download_image
        self.is_download_video = get_config(config, "IS_DOWNLOAD_VIDEO", True, 2) and sys_download_video

        if not self.is_download_image and not self.is_download_video:
            if sys_download_image or sys_download_video:
                self.print_msg("所有支持的下载都没有开启,请检查配置!")
                tool.process_exit()
                return

        # 存档
        if "save_data_path" in extra_config:
            self.save_data_path = os.path.realpath(extra_config["save_data_path"])
        else:
            self.save_data_path = get_config(config, "SAVE_DATA_PATH", "\\\\info/save.data", 3)
        if not sys_not_check_save_data and not os.path.exists(self.save_data_path):
            # 存档文件不存在
            self.print_msg("存档文件%s不存在!" % self.save_data_path)
            tool.process_exit()
            return

        # 是否需要下载图片
        if self.is_download_image:
            # 图片保存目录
            if "image_download_path" in extra_config:
                self.image_download_path = os.path.realpath(extra_config["image_download_path"])
            else:
                self.image_download_path = get_config(config, "IMAGE_DOWNLOAD_PATH", "\\\\photo", 3)
            if not tool.make_dir(self.image_download_path, 0):
                # 图片保存目录创建失败
                self.print_msg("图片保存目录%s创建失败!" % self.image_download_path)
                tool.process_exit()
                return
        else:
            self.image_download_path = ""
        # 是否需要下载视频
        if self.is_download_video:
            # 视频保存目录
            if "video_download_path" in extra_config:
                self.video_download_path = os.path.realpath(extra_config["video_download_path"])
            else:
                self.video_download_path = get_config(config, "VIDEO_DOWNLOAD_PATH", "\\\\video", 3)
            if not tool.make_dir(self.video_download_path, 0):
                # 视频保存目录创建失败
                self.print_msg("视频保存目录%s创建失败!" % self.video_download_path)
                tool.process_exit()
                return
        else:
            self.video_download_path = ""

        # 代理
        is_proxy = get_config(config, "IS_PROXY", 2, 1)
        if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
            proxy_ip = get_config(config, "PROXY_IP", "127.0.0.1", 0)
            proxy_port = get_config(config, "PROXY_PORT", "8087", 0)
            # 使用代理的线程池
            net.set_proxy(proxy_ip, proxy_port)
        else:
            # 初始化urllib3的线程池
            net.init_http_connection_pool()

        # cookies
        self.cookie_value = {}
        if sys_get_cookie:
            # 操作系统&浏览器
            browser_type = get_config(config, "BROWSER_TYPE", 2, 1)
            # cookie
            is_auto_get_cookie = get_config(config, "IS_AUTO_GET_COOKIE", True, 2)
            if is_auto_get_cookie:
                cookie_path = tool.get_default_browser_cookie_path(browser_type)
            else:
                cookie_path = get_config(config, "COOKIE_PATH", "", 0)
            all_cookie_from_browser = tool.get_all_cookie_from_browser(browser_type, cookie_path)
            for cookie_domain in sys_config[SYS_GET_COOKIE]:
                # 如果指定了cookie key
                if sys_config[SYS_GET_COOKIE][cookie_domain]:
                    for cookie_key in sys_config[SYS_GET_COOKIE][cookie_domain]:
                        self.cookie_value[cookie_key] = ""
                    if cookie_domain in all_cookie_from_browser:
                        for cookie_name in self.cookie_value:
                            if cookie_name in all_cookie_from_browser[cookie_domain]:
                                self.cookie_value[cookie_name] = all_cookie_from_browser[cookie_domain][cookie_name]
                # 没有指定cookie key那么就是取全部
                else:
                    if cookie_domain in all_cookie_from_browser:
                        for cookie_name in all_cookie_from_browser[cookie_domain]:
                            self.cookie_value[cookie_name] = all_cookie_from_browser[cookie_domain][cookie_name]

        # Http Setting
        net.HTTP_CONNECTION_TIMEOUT = get_config(config, "HTTP_CONNECTION_TIMEOUT", 10, 1)
        net.HTTP_REQUEST_RETRY_COUNT = get_config(config, "HTTP_REQUEST_RETRY_COUNT", 10, 1)

        # 线程数
        self.thread_count = get_config(config, "THREAD_COUNT", 10, 1)
        self.thread_lock = threading.Lock()

        # 启用线程监控是否需要暂停其他下载线程
        process_control_thread = process.ProcessControl()
        process_control_thread.setDaemon(True)
        process_control_thread.start()

        # 键盘监控线程
        if get_config(config, "IS_KEYBOARD_EVENT", True, 2):
            keyboard_event_bind = {}
            pause_process_key = get_config(config, "PAUSE_PROCESS_KEYBOARD_KEY", "F9", 0)
            # 暂停进程
            if pause_process_key:
                keyboard_event_bind[pause_process_key] = process.pause_process
            # 继续进程
            continue_process_key = get_config(config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10", 0)
            if continue_process_key:
                keyboard_event_bind[continue_process_key] = process.continue_process
            # 结束进程(取消当前的线程,完成任务)
            stop_process_key = get_config(config, "STOP_PROCESS_KEYBOARD_KEY", "CTRL + F12", 0)
            if stop_process_key:
                keyboard_event_bind[stop_process_key] = process.stop_process

            if keyboard_event_bind:
                keyboard_control_thread = keyboardEvent.KeyboardEvent(keyboard_event_bind)
                keyboard_control_thread.setDaemon(True)
                keyboard_control_thread.start()

        self.print_msg("初始化完成")
예제 #31
0
    def run(self):
        """Worker thread entry point: crawl one account's blog pages and
        download every new image since the previous run.

        Reads self.account_info (tab-separated save-data record:
        [0] account id, [1] downloaded image total, [2] last saved blog id,
        [3] optional display name) and appends the updated record to the
        shared save file on completion.
        """
        global TOTAL_IMAGE_COUNT

        account_id = self.account_info[0]
        # Prefer the optional display name (4th field) over the raw account id.
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # If re-sorting is enabled, download into a temporary folder first;
            # otherwise download straight into the destination directory.
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)

            # Image crawl state
            image_count = 1  # 1-based counter of the next image to download
            page_count = 1
            first_blog_id = "0"  # newest blog id seen; becomes the new archive mark
            need_make_image_dir = True  # create the dir lazily, on first download
            is_over = False
            is_big_image_over = False  # set once no more large-size images exist
            while not is_over:
                # Fetch one page of blog entries.
                blog_page = get_one_page_blog(account_id, page_count)
                if blog_page is None:
                    log.error(account_name + " 第%s页日志获取失败" % page_count)
                    tool.process_exit()
                if not blog_page:
                    log.error(account_name + " 第%s页日志解析失败" % page_count)
                    tool.process_exit()

                blog_data_list = get_blog_data_list(blog_page)
                if len(blog_data_list) == 0:
                    log.error(account_name + " 第%s页日志分组失败" % page_count)
                    tool.process_exit()

                for blog_data in blog_data_list:
                    # Extract the blog id for this entry.
                    blog_id = get_blog_id(account_id, blog_data)
                    if blog_id is None:
                        log.error(account_name + " 日志解析日志id失败,日志内容:%s" % blog_data)
                        tool.process_exit()

                    # Stop once we reach a blog already covered by the last run.
                    if blog_id <= int(self.account_info[2]):
                        is_over = True
                        break

                    # Remember the first (newest) blog id as the new archive mark.
                    if first_blog_id == "0":
                        first_blog_id = str(blog_id)

                    # Collect every image URL contained in this blog entry.
                    image_url_list = get_image_url_list(blog_data)
                    if len(image_url_list) == 0:
                        continue

                    # Map of large-image display URLs to their small-image URLs.
                    big_2_small_list = get_big_image_url_list(blog_data)

                    # Download each image.
                    for image_url in image_url_list:
                        # Swap in the large-size URL while large images remain.
                        if not is_big_image_over:
                            image_url, is_big_image_over = check_big_image(image_url, big_2_small_list)
                        log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                        # Create the download directory before the first image.
                        if need_make_image_dir:
                            if not tool.make_dir(image_path, 0):
                                log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                                tool.process_exit()
                            need_make_image_dir = False

                        file_type = image_url.split(".")[-1]
                        # URLs whose "extension" carries a query string get a default type.
                        if file_type.find("?") != -1:
                            file_type = "jpeg"
                        file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_name + " 第%s张图片下载成功" % image_count)
                            image_count += 1
                        else:
                            log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))

                        # Stop when the configured download count is reached.
                        if 0 < GET_IMAGE_COUNT < image_count:
                            is_over = True
                            break

                if not is_over:
                    # Stop when the configured page count is reached.
                    if 0 < GET_PAGE_COUNT <= page_count:
                        is_over = True
                    # Stop when the current page is the last available page.
                    elif page_count >= get_max_page_count(blog_page):
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))

            # Move files from the temp folder to the destination, renumbering them.
            if IS_SORT:
                if first_blog_id != "0":
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_name + " 图片从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # Update the save-data record with the new total and archive mark.
            if first_blog_id != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_blog_id

            # Persist the record and update shared counters under the lock.
            self.thread_lock.acquire()
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            TOTAL_IMAGE_COUNT += image_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            # tool.process_exit() raises SystemExit; code 0 means a clean early stop.
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #32
0
    def run(self):
        global TOTAL_IMAGE_COUNT
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        # todo 是否有需要显示不同名字
        account_name = account_id

        try:
            log.step(account_name + " 开始")

            # todo 是否需要下载图片或视频
            # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            # todo 图片下载逻辑
            # 图片
            image_count = 1
            first_image_time = "0"
            need_make_image_dir = True
            if IS_DOWNLOAD_IMAGE:
                pass

            # todo 视频下载逻辑
            # 视频
            video_count = 1
            first_video_time = "0"
            need_make_video_dir = True
            if IS_DOWNLOAD_VIDEO:
                pass

            log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))

            # 排序
            if IS_SORT:
                # todo 是否需要下载图片
                if first_image_time != "0":
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_name + " 图片从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                        tool.process_exit()
                # todo 是否需要下载视频
                if first_video_time != "0":
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4):
                        log.step(account_name + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # todo 是否需要下载图片或视频
            # 新的存档记录
            if first_image_time != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_image_time
            if first_video_time != "0":
                self.account_info[3] = str(int(self.account_info[3]) + video_count - 1)
                self.account_info[4] = first_video_time

            # 保存最后的信息
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            # todo 是否需要下载图片或视频
            TOTAL_IMAGE_COUNT += image_count - 1
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #33
0
    def run(self):
        """Worker thread entry point: crawl one account's albums via their
        picasaweb pages and download every new image since the previous run.

        Reads self.account_info (tab-separated save-data record:
        [0] account id, [1] downloaded image total, [2] last saved album id,
        [3] optional display name, [4] optional download sub-directory) and
        appends the updated record to the shared save file on completion.
        """
        global TOTAL_IMAGE_COUNT

        account_id = self.account_info[0]
        # Prefer the optional display name (4th field) over the raw account id.
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]
        # Optional extra sub-directory (5th field) under the download root.
        if len(self.account_info) >= 5 and self.account_info[4]:
            account_file_path = self.account_info[4]
        else:
            account_file_path = ""

        try:
            log.step(account_name + " 开始")

            # If re-sorting is enabled, download into a temporary folder first;
            # otherwise download straight into the destination directory.
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_file_path, account_name)

            # Image crawl state
            image_count = 1  # 1-based counter of the next image to download
            key = ""  # pagination token ("" fetches the first page)
            first_album_id = "0"  # newest album id seen; becomes the new archive mark
            unique_list = []  # album ids already handled (pages can repeat them)
            is_over = False
            need_make_download_dir = True  # create the dir lazily, on first download
            while not is_over:
                # Fetch one page of albums.
                album_page = get_one_page_album(account_id, key)
                if album_page is None:
                    log.error(account_name + " 无法访问相册页,token:%s" % key)
                    tool.process_exit()

                # Collect every picasaweb URL on the album page.
                picasaweb_url_list = get_picasaweb_url_list(album_page)

                log.trace(account_name + " 相册获取的所有picasaweb页:%s" % picasaweb_url_list)
                for picasaweb_url in picasaweb_url_list:
                    # Some URLs carry an authkey with a JS-escaped "="; undo the escape, e.g.
                    # https://picasaweb.google.com/116300481938868290370/2015092603?authkey\u003dGv1sRgCOGLq-jctf-7Ww#6198800191175756402
                    picasaweb_url = picasaweb_url.replace("\u003d", "=")

                    # Resolve the picasaweb page to its album id.
                    album_id = get_picasaweb_page_album_id(account_id, picasaweb_url)
                    if album_id is None:
                        log.error(account_name + " 第%s张图片,无法访问picasaweb页 %s" % (image_count, picasaweb_url))
                        continue
                    if not album_id:
                        log.error(account_name + " 第%s张图片,picasaweb页 %s 获取album id失败" % (image_count, picasaweb_url))
                        continue
                    log.trace(account_name + " picasaweb页 %s 的album id:%s" % (picasaweb_url, album_id))

                    # Stop once we reach an album already covered by the last run.
                    if int(album_id) <= int(self.account_info[2]):
                        is_over = True
                        break

                    # Skip album ids that were already processed on this run.
                    if album_id in unique_list:
                        continue
                    else:
                        unique_list.append(album_id)
                    # Remember the first (newest) album id as the new archive mark.
                    if first_album_id == "0":
                        first_album_id = album_id

                    # Collect every image URL from the album's archive page.
                    image_url_list = get_image_url_list(account_id, album_id)
                    if image_url_list is None:
                        log.error(account_name + " 第%s张图片,无法访问album id:%s 的相册存档页" % (image_count, album_id))
                        continue
                    if len(image_url_list) == 0:
                        log.error(account_name + " 第%s张图片,album id:%s 的相册存档页没有解析到图片" % (image_count, album_id))
                        continue

                    log.trace(account_name + " album id:%s 的相册存档页获取的所有图片:%s" % (album_id, image_url_list))
                    for image_url in list(image_url_list):
                        image_url = generate_max_resolution_image_url(image_url)

                        # Download the image.
                        log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                        # Create the download directory before the first image.
                        if need_make_download_dir:
                            if not tool.make_dir(image_path, 0):
                                log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                                tool.process_exit()
                            need_make_download_dir = False

                        # Take the extension only if it follows the last path segment.
                        if image_url.rfind("/") < image_url.rfind("."):
                            file_type = image_url.split(".")[-1]
                        else:
                            file_type = "jpg"
                        file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_name + " 第%s张图片下载成功" % image_count)
                            image_count += 1
                        else:
                            log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))

                        # Stop when the configured download count is reached.
                        if 0 < GET_IMAGE_COUNT < image_count:
                            is_over = True
                            break
                    if is_over:
                        break

                if not is_over:
                    # Look for the next page's pagination token in the page body.
                    key_find = re.findall('"([.]?[a-zA-Z0-9-_]*)"', album_page)
                    if len(key_find) > 0 and len(key_find[0]) > 80:
                        key = key_find[0]
                    else:
                        # No token found; dump the page if this is not a first run
                        # (a first run legitimately has nothing to page through).
                        if self.account_info[2] != "0":
                            log.error(account_name + " 没有找到下一页的token,将该页保存:")
                            log.error(album_page)
                        is_over = True

            log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))

            # Move files from the temp folder to the destination, renumbering them.
            if IS_SORT and image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_file_path, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建图片子目录 %s 失败" % destination_path)
                    tool.process_exit()

            # Update the save-data record with the new total and archive mark.
            if first_album_id != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_album_id

            # Persist the record and update shared counters.
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            # tool.process_exit() raises SystemExit; code 0 means a clean early stop.
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #34
0
    def run(self):
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录
            if IS_SORT:
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            page_count = 1
            video_count = 1
            first_video_id = "0"
            unique_list = []
            is_over = False
            need_make_download_dir = True
            while not is_over:
                # 获取指定一页的视频信息
                medias_data = get_one_page_video_data(account_id, page_count)
                if medias_data is None:
                    log.error(account_name + " 视频列表获取失败")
                    tool.process_exit()

                for media in medias_data:
                    if not robot.check_sub_key(("video", "id"), media):
                        log.error(account_name + " 第%s个视频信:%s解析失败" % (video_count, media))
                        continue

                    video_id = str(media["id"])

                    # 检查是否图片时间小于上次的记录
                    if int(video_id) <= int(self.account_info[2]):
                        is_over = True
                        break

                    # 新增视频导致的重复判断
                    if video_id in unique_list:
                        continue
                    else:
                        unique_list.append(video_id)
                    # 将第一张图片的上传时间做为新的存档记录
                    if first_video_id == "0":
                        first_video_id = video_id

                    video_url = str(media["video"])
                    log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url))

                    # 第一个视频,创建目录
                    if need_make_download_dir:
                        if not tool.make_dir(video_path, 0):
                            log.error(account_name + " 创建视频下载目录 %s 失败" % video_path)
                            tool.process_exit()
                        need_make_download_dir = False
                        
                    file_path = os.path.join(video_path, "%04d.mp4" % video_count)
                    if tool.save_net_file(video_url, file_path):
                        log.step(account_name + " 第%s个视频下载成功" % video_count)
                        video_count += 1
                    else:
                        log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))

                    # 达到配置文件中的下载数量,结束
                    if 0 < GET_VIDEO_COUNT < video_count:
                        is_over = True
                        break

                if not is_over:
                    if len(medias_data) >= VIDEO_COUNT_PER_PAGE:
                        page_count += 1
                    else:
                        # 获取的数量小于请求的数量,已经没有剩余视频了
                        is_over = True

            log.step(account_name + " 下载完毕,总共获得%s个视频" % (video_count - 1))

            # 排序
            if IS_SORT and video_count > 1:
                destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                if robot.sort_file(video_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 视频从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                    tool.process_exit()

            # 新的存档记录
            if first_video_id != "":
                self.account_info[1] = str(int(self.account_info[1]) + video_count - 1)
                self.account_info[2] = first_video_id

            # 保存最后的信息
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #35
0
    def main(self):
        """Crawler entry point: resume from the saved page number and download
        every album's images page by page until the last page or an early exit.

        The save file stores a single integer — the next page number to crawl —
        which is rewritten when the loop ends.
        """
        # Resume from the page number recorded in the save file (if any).
        page_count = 1
        if os.path.exists(self.save_data_path):
            save_file = open(self.save_data_path, "r")
            save_info = save_file.read()
            save_file.close()
            page_count = int(save_info.strip())

        total_image_count = 0
        error_count = 0  # consecutive deleted-album pages seen
        is_over = False
        while not is_over:
            album_status, album_data = get_one_page_album_data(page_count)

            if album_status == -1:
                log.error("第%s页相册获取失败" % page_count)
                break
            elif album_status == -2:
                log.error("第%s页相册解析失败" % page_count)
                break
            elif album_status == 2:
                # Deleted album: tolerate a limited run of consecutive misses.
                error_count += 1
                if error_count >= ERROR_PAGE_COUNT_CHECK:
                    log.error("连续%s页相册没有图片,退出程序" % ERROR_PAGE_COUNT_CHECK)
                    # Rewind so the save file points at the first missing page.
                    page_count -= error_count - 1
                    break
                else:
                    log.error("第%s页相册已被删除" % page_count)
                    page_count += 1
                    continue
            elif album_status == 3:
                # Music album: nothing to download, skip it.
                log.error("第%s页歌曲相册" % page_count)
                page_count += 1
                continue
            elif album_status == 4:
                log.error("第%s页相册未知相册类型%s" % (page_count, album_data))
                break
            # Valid album: reset the consecutive-miss counter.
            error_count = 0

            # Build the album directory name from the page number and title.
            title = ""
            if album_data["title"]:
                # Strip characters the filesystem does not allow.
                title = robot.filter_text(str(album_data["title"].encode("utf-8")))
            if title:
                image_path = os.path.join(self.image_download_path, "%04d %s" % (page_count, title))
            else:
                image_path = os.path.join(self.image_download_path, "%04d" % page_count)
            if not tool.make_dir(image_path, 0):
                # Directory creation failed (likely illegal characters left in the
                # title): retry with a title-less directory name, then give up.
                # BUG FIX: the old code joined the int page_count under the path
                # that just failed (a type error) and never used the result, so
                # downloads still targeted the failed directory; the retry must
                # rebuild image_path without the title.
                log.error("第%s页创建相册目录 %s 失败,尝试不使用title" % (page_count, image_path))
                image_path = os.path.join(self.image_download_path, "%04d" % page_count)
                if not tool.make_dir(image_path, 0):
                    log.error("第%s页创建相册目录 %s 失败" % (page_count, image_path))
                    tool.process_exit()

            image_count = 1  # 1-based counter of the next image in this album
            for image_data in album_data["attr"]["img"]:
                image_url = "http://www.zunguang.com/%s" % str(image_data["url"])
                log.step("开始下载第%s页第%s张图片 %s" % (page_count, image_count, image_url))

                file_type = image_url.split(".")[-1]
                file_path = os.path.join(image_path, "%03d.%s" % (image_count, file_type))
                try:
                    if tool.save_net_file(image_url, file_path, True):
                        log.step("第%s页第%s张图片下载成功" % (page_count, image_count))
                        image_count += 1
                    else:
                        log.error("第%s页第%s张图片 %s 下载失败" % (page_count, image_count, image_url))
                except SystemExit:
                    # Early exit requested: drop the partial album and stop;
                    # page_count is NOT advanced, so the album is retried next run.
                    log.step("提前退出")
                    tool.remove_dir(image_path)
                    is_over = True
                    break

            if not is_over:
                total_image_count += image_count - 1
                page_count += 1

        # Rewrite the save file with the next page to crawl.
        save_data_dir = os.path.dirname(self.save_data_path)
        if not os.path.exists(save_data_dir):
            tool.make_dir(save_data_dir, 0)
        save_file = open(self.save_data_path, "w")
        save_file.write(str(page_count))
        save_file.close()

        log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), total_image_count))
예제 #36
0
def http_request(url, method="GET", post_data=None, binary_data=None, header_list=None, cookies_list=None, connection_timeout=HTTP_CONNECTION_TIMEOUT,
                 read_timeout=HTTP_CONNECTION_TIMEOUT, is_random_ip=True, json_decode=False, encode_multipart=False, redirect=True, exception_return=""):
    """Send an HTTP request through the shared urllib3 connection pool and return the response.

    Retries indefinitely on timeouts and transient errors, honoring the global
    process pause/stop status between attempts.

    url                 target address; must start with http:// or https://
    method              HTTP verb; one of GET/POST/HEAD/PUT/DELETE/OPTIONS/TRACE
    post_data           form fields for a POST (used only when binary_data is None)
    binary_data         raw request body for a POST
    header_list         extra request headers (dict); mutated in place with
                        User-Agent / X-Forwarded-For / Cookie entries
    cookies_list        cookies serialized into the Cookie header
    connection_timeout  connect timeout in seconds; 0 means no limit
    read_timeout        read timeout in seconds; 0 means no limit
    is_random_ip        when True, spoof X-Forwarded-For / X-Real-Ip with a random address
    json_decode         when True, parse a successful body into response.json_data
    exception_return    substring: if it appears in an exception message, return
                        ErrorResponse(HTTP_RETURN_CODE_EXCEPTION_CATCH) instead of retrying

    Returns a urllib3 response (its .status may be rewritten to
    HTTP_RETURN_CODE_JSON_DECODE_ERROR) or an ErrorResponse.
    """
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    # lazily create the module-level connection pool on first use
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()

    # NOTE(review): retry_count is initialized but never incremented or checked —
    # the loop below retries forever unless it returns or exits
    retry_count = 0
    while True:
        # block while the whole process is paused; abort if it was stopped
        while process.PROCESS_STATUS == process.PROCESS_STATUS_PAUSE:
            time.sleep(10)
        if process.PROCESS_STATUS == process.PROCESS_STATUS_STOP:
            tool.process_exit(0)

        if header_list is None:
            header_list = {}

        # set a User-Agent if the caller did not supply one
        if "User-Agent" not in header_list:
            header_list["User-Agent"] = _random_user_agent()

        # spoof the client address with a random IP
        if is_random_ip:
            random_ip = _random_ip_address()
            header_list["X-Forwarded-For"] = random_ip
            header_list["X-Real-Ip"] = random_ip

        # serialize cookies into the Cookie header
        if cookies_list:
            header_list["Cookie"] = build_header_cookie_string(cookies_list)

        try:
            # 0 means "no limit" for either timeout component
            if connection_timeout == 0 and read_timeout == 0:
                timeout = None
            elif connection_timeout == 0:
                timeout = urllib3.Timeout(read=read_timeout)
            elif read_timeout == 0:
                timeout = urllib3.Timeout(connect=connection_timeout)
            else:
                timeout = urllib3.Timeout(connect=connection_timeout, read=read_timeout)
            if method == "POST":
                if binary_data is None:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, fields=post_data, encode_multipart=encode_multipart)
                else:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, body=binary_data, encode_multipart=encode_multipart)
            else:
                response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data)
                except ValueError:
                    # default decode failed; retry with the charset advertised in Content-Type
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            except:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        # body is not valid JSON in any known charset
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            return response
        except urllib3.exceptions.ProxyError:
            # proxy unreachable: ask the operator whether to continue or stop
            notice = "无法访问代理服务器,请检查代理设置。检查完成后输入(C)ontinue继续程序或者(S)top退出程序:"
            input_str = tool.console_input(notice).lower()
            if input_str in ["c", "continue"]:
                pass
            elif input_str in ["s", "stop"]:
                tool.process_exit(0)
        except urllib3.exceptions.ReadTimeoutError:
            pass
        except urllib3.exceptions.ConnectTimeoutError, e:
            # domain could not be resolved (Windows getaddrinfo error)
            if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            pass  # NOTE(review): unreachable only when the return above fires; otherwise falls through to retry
        # except urllib3.exceptions.MaxRetryError, e:
        #     print_msg(url)
        #     print_msg(str(e))
        #     # 无限重定向
        #     # if str(e).find("Caused by ResponseError('too many redirects',)") >= 0:
        #     #     return ErrorResponse(-1)
        # except urllib3.exceptions.ConnectTimeoutError, e:
        #     print_msg(str(e))
        #     print_msg(url + " 访问超时,稍后重试")
        #     # 域名无法解析
        #     # if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
        #     #     return ErrorResponse(-2)
        # except urllib3.exceptions.ProtocolError, e:
        #     print_msg(str(e))
        #     print_msg(url + " 访问超时,稍后重试")
        #     # 链接被终止
        #     # if str(e).find("'Connection aborted.', error(10054,") >= 0:
        #     #     return ErrorResponse(-3)
        except Exception, e:
            # optional caller-specified short-circuit on a matching exception message
            if exception_return and str(e).find(exception_return) >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_EXCEPTION_CATCH)
            elif str(e).find("EOF occurred in violation of protocol") >=0:
                # SSL handshake EOF: back off before retrying
                time.sleep(30)
            tool.print_msg(str(e))
            tool.print_msg(url + " 访问超时,稍后重试")
            traceback.print_exc()
예제 #37
0
    def run(self):
        """Worker thread entry: download every new album for one account.

        self.account_info layout (tab-separated save record):
          [0] account name, [1] id of the newest post already downloaded.
        On completion, appends the new record to NEW_SAVE_DATA_PATH and updates
        the shared TOTAL_IMAGE_COUNT / ACCOUNTS globals under self.thread_lock.
        """
        global TOTAL_IMAGE_COUNT

        account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # a purely numeric account name is already the site id
            if account_name.isdigit():
                site_id = account_name
            else:
                site_id = get_site_id(account_name)
            if site_id is None:
                log.error(account_name + " 主页无法访问")
                tool.process_exit()

            if not site_id:
                log.error(account_name + " site id解析失败")
                tool.process_exit()

            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)

            this_account_total_image_count = 0
            post_count = 0
            first_post_id = "0"
            # NOTE(review): hard-coded initial cursor for the first page query — confirm intent
            post_time = "2016-11-16 14:12:00"
            is_over = False
            while not is_over:
                # fetch one page of album info
                post_info_list = get_one_page_post_info_list(site_id, post_time)
                if post_info_list is None:
                    log.error(account_name + " 相册信息列表无法访问")
                    tool.process_exit()

                # an empty page means everything has been fetched
                if len(post_info_list) == 0:
                    break

                for post_info in post_info_list:
                    if not robot.check_sub_key(("title", "post_id", "published_at", "images"), post_info):
                        log.error(account_name + " 相册信息解析失败:%s" % post_info)
                        continue

                    post_id = str(post_info["post_id"])

                    # stop once we reach a post already covered by the save record
                    if int(post_id) <= int(self.account_info[1]):
                        is_over = True
                        break

                    # the first (newest) post id becomes the new save record
                    if first_post_id == "0":
                        first_post_id = post_id

                    # strip characters the filesystem does not support from the title
                    title = robot.filter_text(post_info["title"])
                    if title:
                        post_path = os.path.join(image_path, "%s %s" % (post_id, title))
                    else:
                        post_path = os.path.join(image_path, post_id)
                    if not tool.make_dir(post_path, 0):
                        # directory creation failed; retry without the title, give up if that fails too
                        log.error(account_name + " 创建相册目录 %s 失败,尝试不使用title" % post_path)
                        post_path = os.path.join(image_path, post_id)
                        if not tool.make_dir(post_path, 0):
                            log.error(account_name + " 创建相册目录 %s 失败" % post_path)
                            tool.process_exit()

                    # NOTE(review): image_count counts attempted images, including failed downloads
                    image_count = 0
                    for image_info in post_info["images"]:
                        image_count += 1
                        if not robot.check_sub_key(("img_id",), image_info):
                            log.error(account_name + " 相册%s 第%s张图片解析失败" % (post_id, image_count))
                            continue
                        image_url = generate_large_image_url(site_id, image_info["img_id"])
                        log.step(account_name + " 相册%s 开始下载第%s张图片 %s" % (post_id, image_count, image_url))

                        file_path = os.path.join(post_path, "%s.jpg" % image_count)
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_name + " 相册%s 第%s张图片下载成功" % (post_id, image_count))
                        else:
                            log.error(account_name + " 相册%s 第%s张图片 %s 下载失败" % (post_info["post_id"], image_count, image_url))
                    this_account_total_image_count += image_count

                    if not is_over:
                        # stop when the configured limit is reached
                        # NOTE(review): post_count counts posts, not pages — confirm against GET_PAGE_COUNT semantics
                        if 0 < GET_PAGE_COUNT < post_count:
                            is_over = True
                        else:
                            # this album's publish time is the cursor for the next page query
                            post_time = post_info["published_at"]
                            post_count += 1

            log.step(account_name + " 下载完毕,总共获得%s张图片" % this_account_total_image_count)

            # new save record
            if first_post_id != "0":
                self.account_info[1] = first_post_id

            # persist the final record and update shared totals
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += this_account_total_image_count
            ACCOUNTS.remove(account_name)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            # exit code 0 means a deliberate early stop; anything else is abnormal
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #38
0
    def run(self):
        """Worker thread entry: download every new diary image for one account.

        self.account_info layout (tab-separated save record):
          [0] account id, [1] total images downloaded so far,
          [2] id of the newest diary already downloaded, [3] (optional) display name.
        On completion, appends the new record to NEW_SAVE_DATA_PATH and updates
        the shared TOTAL_IMAGE_COUNT / ACCOUNTS globals under self.thread_lock.
        """
        global TOTAL_IMAGE_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # download into a temp directory when re-sorting is enabled, otherwise straight to the target
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)

            image_count = 1
            page_count = 1
            first_diary_id = "0"
            is_over = False
            need_make_image_dir = True  # create the directory lazily, just before the first download
            while not is_over:
                # fetch one page of diary entries
                diary_list = get_one_page_diary_data(account_id, page_count)
                if diary_list is None:
                    log.error(account_name + " 第%s页日志列表解析异常" % page_count)
                    tool.process_exit()

                # an empty page means all diaries have been fetched
                if len(diary_list) == 0:
                    break

                for diary_info in list(diary_list):
                    # diary id
                    diary_id = tool.find_sub_string(diary_info, "id=", "&")
                    if not diary_id:
                        log.error(account_name + " 日志id解析异常,日志信息:%s" % diary_info)
                        continue

                    # stop once we reach the last diary recorded by the previous run
                    if int(diary_id) <= int(self.account_info[2]):
                        is_over = True
                        break

                    # the first (newest) diary id becomes the new save record
                    if first_diary_id == "0":
                        first_diary_id = diary_id

                    log.trace(account_name + " 日志id %s" % diary_id)

                    # all image urls inside this diary
                    image_url_list = get_image_url_list(diary_info)
                    for image_url in image_url_list:
                        # a url without a scheme is relative to the site domain; make it absolute
                        if image_url[:7] != "http://" and image_url[:8] != "https://":
                            if image_url[0] == "/":
                                image_url = "http://www.keyakizaka46.com%s" % image_url
                            else:
                                image_url = "http://www.keyakizaka46.com/%s" % image_url

                        log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                        # first image: create the directory
                        if need_make_image_dir:
                            if not tool.make_dir(image_path, 0):
                                log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                                tool.process_exit()
                            need_make_image_dir = False

                        file_type = image_url.split(".")[-1]
                        file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_name + " 第%s张图片下载成功" % image_count)
                            image_count += 1
                        else:
                            log.error(account_name + " 第%s张图片 %s 获取失败" % (image_count, image_url))

                    # stop when the configured image count is reached
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break

                if not is_over:
                    # stop when the configured page count is reached
                    if 0 < GET_PAGE_COUNT <= page_count:
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))

            # re-sort: move images from the temp directory into numbered slots in the save directory
            if IS_SORT and image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建图片子目录 %s 失败" % destination_path)
                    tool.process_exit()

            # new save record: bump the running total and remember the newest diary id
            if first_diary_id != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_diary_id

            # persist the final record and update shared totals
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:
            # exit code 0 means a deliberate early stop; anything else is abnormal
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #39
0
    def run(self):
        global TOTAL_IMAGE_COUNT

        coser_id = self.account_info[0]
        if len(self.account_info) >= 3:
            cn = self.account_info[2]
        else:
            cn = self.account_info[0]

        try:
            log.step(cn + " 开始")

            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, cn)

            # 图片下载
            this_cn_total_image_count = 0
            page_count = 1
            total_rp_count = 1
            first_rp_id = ""
            unique_list = []
            is_over = False
            need_make_download_dir = True  # 是否需要创建cn目录
            while not is_over:
                # 获取一页的作品信息
                post_page = get_one_page_post(coser_id, page_count)
                if post_page is None:
                    log.error(cn + " 无法访问第%s页作品" % page_count)
                    tool.process_exit()

                # 解析作品信息,获取所有的正片信息
                cp_id, rp_list = get_rp_list(post_page)
                if cp_id is None:
                    log.error(cn + " 第%s页作品解析异常" % page_count)
                    tool.process_exit()

                for rp_id, title in rp_list.iteritems():
                    # 检查是否已下载到前一次的图片
                    if int(rp_id) <= int(self.account_info[1]):
                        is_over = True
                        break

                    # 新增正片导致的重复判断
                    if rp_id in unique_list:
                        continue
                    else:
                        unique_list.append(rp_id)
                    # 将第一个作品的id做为新的存档记录
                    if first_rp_id == "":
                        first_rp_id = rp_id

                    log.trace("rp: " + rp_id)

                    if need_make_download_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(cn + " 创建CN目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_download_dir = False

                    # 过滤标题中不支持的字符
                    title = robot.filter_text(title)
                    if title:
                        rp_path = os.path.join(image_path, "%s %s" % (rp_id, title))
                    else:
                        rp_path = os.path.join(image_path, rp_id)
                    if not tool.make_dir(rp_path, 0):
                        # 目录出错,把title去掉后再试一次,如果还不行退出
                        log.error(cn + " 创建作品目录 %s 失败,尝试不使用title" % rp_path)
                        rp_path = os.path.join(image_path, rp_id)
                        if not tool.make_dir(rp_path, 0):
                            log.error(cn + " 创建作品目录 %s 失败" % rp_path)
                            tool.process_exit()

                    # 获取正片页面内的所有图片地址列表
                    image_url_list = get_image_url_list(cp_id, rp_id)
                    if image_url_list is None:
                        log.error(cn + " 无法访问正片:%s,cp_id:%s" % (rp_id, cp_id))
                        continue

                    if len(image_url_list) == 0 and IS_AUTO_FOLLOW:
                        log.step(cn + " 检测到可能有私密作品且账号不是ta的粉丝,自动关注")
                        if follow(coser_id):
                            # 重新获取下正片页面内的所有图片地址列表
                            image_url_list = get_image_url_list(cp_id, rp_id)

                    if len(image_url_list) == 0:
                        log.error(cn + " 正片:%s没有任何图片,可能是你使用的账号没有关注ta,所以无法访问只对粉丝开放的私密作品,cp_id:%s" % (rp_id, cp_id))
                        continue

                    image_count = 1
                    for image_url in list(image_url_list):
                        # 禁用指定分辨率
                        image_url = "/".join(image_url.split("/")[0:-1])
                        log.step(cn + " %s 开始下载第%s张图片 %s" % (rp_id, image_count, image_url))

                        if image_url.rfind("/") < image_url.rfind("."):
                            file_type = image_url.split(".")[-1]
                        else:
                            file_type = "jpg"
                        file_path = os.path.join(rp_path, "%03d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            image_count += 1
                            log.step(cn + " %s 第%s张图片下载成功" % (rp_id, image_count))
                        else:
                            log.error(cn + " %s 第%s张图片 %s 下载失败" % (rp_id, image_count, image_url))

                    this_cn_total_image_count += image_count - 1

                    if 0 < GET_PAGE_COUNT < total_rp_count:
                        is_over = True
                        break
                    else:
                        total_rp_count += 1

                if not is_over:
                    if page_count >= get_max_page_count(coser_id, post_page):
                        is_over = True
                    else:
                        page_count += 1

            log.step(cn + " 下载完毕,总共获得%s张图片" % this_cn_total_image_count)

            # 新的存档记录
            if first_rp_id != "":
                self.account_info[1] = first_rp_id

            # 保存最后的信息
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += this_cn_total_image_count
            ACCOUNTS.remove(coser_id)
            self.thread_lock.release()

            log.step(cn + " 完成")
        except SystemExit:
            log.error(cn + " 异常退出")
        except Exception, e:
            log.error(cn + " 未知异常")
            log.error(str(e) + "\n" + str(traceback.format_exc()))
예제 #40
0
    def __init__(self, sys_config, **kwargs):
        """Initialize the crawler from the subclass's startup config plus config files.

        sys_config: dict of SYS_* capability flags declared by the subclass.
        kwargs:
          extra_sys_config: dict, merged over sys_config (overrides the subclass).
          extra_app_config: dict, merged over the values read from the config files.
        Reads config.ini / app.ini, resolves download and save paths, prepares the
        connection pool (optionally through a proxy), loads browser cookies, and
        starts the optional port-listener / keyboard monitor control threads.
        """
        self.start_time = time.time()

        # the startup configuration must be a dict
        if not isinstance(sys_config, dict):
            output.print_msg("程序启动配置不存在,请检查代码!")
            tool.process_exit()
            return
        # extra startup config passed at instantiation overrides the subclass's sys_config
        if "extra_sys_config" in kwargs and isinstance(
                kwargs["extra_sys_config"], dict):
            sys_config.update(kwargs["extra_sys_config"])
        # capability flags: the mere presence of the key in sys_config enables the feature
        sys_download_photo = SYS_DOWNLOAD_PHOTO in sys_config
        sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config
        sys_download_audio = SYS_DOWNLOAD_AUDIO in sys_config
        sys_set_proxy = SYS_SET_PROXY in sys_config
        sys_get_cookie = SYS_GET_COOKIE in sys_config
        sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config
        sys_not_download = SYS_NOT_DOWNLOAD in sys_config

        # frozen executable: resolve config relative to the exe's directory
        if tool.IS_EXECUTABLE:
            application_path = os.path.dirname(sys.executable)
            os.chdir(application_path)
            config_path = os.path.join(os.getcwd(), "data/config.ini")
        else:
            config_path = PROJECT_CONFIG_PATH

        # program config
        config = read_config(config_path)
        # application config (app.ini) overrides program config
        app_config_path = os.path.abspath(
            os.path.join(PROJECT_APP_PATH, "app.ini"))
        if os.path.exists(app_config_path):
            config.update(read_config(app_config_path))
        # extra app config passed at instantiation overrides the config files
        if "extra_app_config" in kwargs and isinstance(
                kwargs["extra_app_config"], dict):
            config.update(kwargs["extra_app_config"])

        # app-specific config entries declared by the subclass as (name, default, mode) templates
        self.app_config = {}
        if SYS_APP_CONFIG in sys_config and len(
                sys_config[SYS_APP_CONFIG]) > 0:
            for app_config_template in sys_config[SYS_APP_CONFIG]:
                if len(app_config_template) == 3:
                    self.app_config[app_config_template[0]] = analysis_config(
                        config, app_config_template[0], app_config_template[1],
                        app_config_template[2])

        # download switches: both the config file and the sys capability must allow it
        self.is_download_photo = analysis_config(
            config, "IS_DOWNLOAD_PHOTO", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_photo
        self.is_download_video = analysis_config(
            config, "IS_DOWNLOAD_VIDEO", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_video
        self.is_download_audio = analysis_config(
            config, "IS_DOWNLOAD_AUDIO", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_audio

        # abort when the app supports downloading but every supported type is disabled by config
        if not sys_not_download and not self.is_download_photo and not self.is_download_video and not self.is_download_audio:
            if sys_download_photo or sys_download_video or sys_download_audio:
                output.print_msg("所有支持的下载都没有开启,请检查配置!")
                tool.process_exit()
                return

        # save-data file
        self.save_data_path = analysis_config(config, "SAVE_DATA_PATH",
                                              "\\\\info/save.data",
                                              CONFIG_ANALYSIS_MODE_PATH)
        if not sys_not_check_save_data and not os.path.exists(
                self.save_data_path):
            # save file missing
            output.print_msg("存档文件%s不存在!" % self.save_data_path)
            tool.process_exit()
            return
        # temp save file name is the save file name prefixed with the current time
        temp_file_name = time.strftime(
            "%m-%d_%H_%M_", time.localtime(time.time())) + os.path.basename(
                self.save_data_path)
        self.temp_save_data_path = os.path.join(
            os.path.dirname(self.save_data_path), temp_file_name)
        if os.path.exists(self.temp_save_data_path):
            # temp file already exists (possibly left over from a crashed run)
            output.print_msg("存档临时文件%s已存在!" % self.temp_save_data_path)
            tool.process_exit()
            return

        # cache directory
        self.cache_data_path = analysis_config(config, "CACHE_DATA_PATH",
                                               "\\\\cache",
                                               CONFIG_ANALYSIS_MODE_PATH)

        # session file
        self.session_data_path = analysis_config(config, "SESSION_DATA_PATH",
                                                 "\\\\info/session.data",
                                                 CONFIG_ANALYSIS_MODE_PATH)

        # photo download directory (only resolved when photo download is enabled)
        if self.is_download_photo:
            self.photo_download_path = analysis_config(
                config, "PHOTO_DOWNLOAD_PATH", "\\\\photo",
                CONFIG_ANALYSIS_MODE_PATH)
        else:
            self.photo_download_path = ""
        # video download directory (only resolved when video download is enabled)
        if self.is_download_video:
            self.video_download_path = analysis_config(
                config, "VIDEO_DOWNLOAD_PATH", "\\\\video",
                CONFIG_ANALYSIS_MODE_PATH)
        else:
            self.video_download_path = ""
        # audio download directory (only resolved when audio download is enabled)
        if self.is_download_audio:
            self.audio_download_path = analysis_config(
                config, "AUDIO_DOWNLOAD_PATH", "\\\\audio",
                CONFIG_ANALYSIS_MODE_PATH)
        else:
            self.audio_download_path = ""

        # proxy: 1 = always on, 2 = on only when the app declared SYS_SET_PROXY
        is_proxy = analysis_config(config, "IS_PROXY", 2,
                                   CONFIG_ANALYSIS_MODE_INTEGER)
        if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
            proxy_ip = analysis_config(config, "PROXY_IP", "127.0.0.1")
            proxy_port = analysis_config(config, "PROXY_PORT", "8087")
            # connection pool routed through the proxy
            net.set_proxy(proxy_ip, proxy_port)
        else:
            # plain urllib3 connection pool
            net.init_http_connection_pool()

        # cookies loaded from the local browser for the domains the app declared
        self.cookie_value = {}
        if sys_get_cookie:
            # browser type (encodes OS & browser)
            browser_type = analysis_config(config, "BROWSER_TYPE", 2,
                                           CONFIG_ANALYSIS_MODE_INTEGER)
            # cookie storage location: auto-detected or taken from config
            is_auto_get_cookie = analysis_config(config, "IS_AUTO_GET_COOKIE",
                                                 True,
                                                 CONFIG_ANALYSIS_MODE_BOOLEAN)
            if is_auto_get_cookie:
                cookie_path = browser.get_default_browser_cookie_path(
                    browser_type)
            else:
                cookie_path = analysis_config(config, "COOKIE_PATH", "")
            all_cookie_from_browser = browser.get_all_cookie_from_browser(
                browser_type, cookie_path)
            # match each declared domain both with and without the leading dot
            for cookie_domain in sys_config[SYS_GET_COOKIE]:
                check_domain_list = [cookie_domain]
                if cookie_domain[0] != ".":
                    check_domain_list.append("." + cookie_domain)
                elif cookie_domain[0] == ".":
                    check_domain_list.append(cookie_domain[1:])
                for check_domain in check_domain_list:
                    if check_domain in all_cookie_from_browser:
                        self.cookie_value.update(
                            all_cookie_from_browser[check_domain])

        # thread pool size
        self.thread_count = analysis_config(config, "THREAD_COUNT", 10,
                                            CONFIG_ANALYSIS_MODE_INTEGER)
        self.thread_lock = threading.Lock()  # lock guarding shared global counters
        self.thread_semaphore = threading.Semaphore(
            self.thread_count)  # caps the number of concurrent worker threads

        # port listener thread: lets an external client pause/resume/stop downloads
        # NOTE(review): "ENVET" looks like a typo for "EVENT", but it must match the key
        # actually used in config.ini — do not rename here without updating the config
        if analysis_config(config, "IS_PORT_LISTENER_ENVET", True,
                           CONFIG_ANALYSIS_MODE_BOOLEAN):
            listener_event_bind = {}
            # pause the process
            listener_event_bind[str(
                portListenerEvent.PROCESS_STATUS_PAUSE)] = net.pause_request
            # resume the process
            listener_event_bind[str(
                portListenerEvent.PROCESS_STATUS_RUN)] = net.resume_request
            # stop the process (cancel remaining work, let current threads finish)
            listener_event_bind[str(
                portListenerEvent.PROCESS_STATUS_STOP)] = self.stop_process

            listener_port = analysis_config(config, "LISTENER_PORT", 12345,
                                            CONFIG_ANALYSIS_MODE_INTEGER)
            process_control_thread = portListenerEvent.PortListenerEvent(
                port=listener_port, event_list=listener_event_bind)
            process_control_thread.setDaemon(True)
            process_control_thread.start()

        # keyboard monitor thread (Windows only)
        if platform.system() == "Windows" and analysis_config(
                config, "IS_KEYBOARD_EVENT", True,
                CONFIG_ANALYSIS_MODE_BOOLEAN):
            keyboard_event_bind = {}
            pause_process_key = analysis_config(config,
                                                "PAUSE_PROCESS_KEYBOARD_KEY",
                                                "F9")
            # pause the process
            if pause_process_key:
                keyboard_event_bind[pause_process_key] = self.pause_process
            # resume the process
            continue_process_key = analysis_config(
                config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10")
            if continue_process_key:
                keyboard_event_bind[continue_process_key] = self.resume_process
            # stop the process (cancel remaining work, let current threads finish)
            stop_process_key = analysis_config(config,
                                               "STOP_PROCESS_KEYBOARD_KEY",
                                               "CTRL + F12")
            if stop_process_key:
                keyboard_event_bind[stop_process_key] = self.stop_process

            if keyboard_event_bind:
                keyboard_control_thread = keyboardEvent.KeyboardEvent(
                    keyboard_event_bind)
                keyboard_control_thread.setDaemon(True)
                keyboard_control_thread.start()

        # download counters accumulated by the worker threads
        self.total_photo_count = 0
        self.total_video_count = 0
        self.total_audio_count = 0

        output.print_msg("初始化完成")
예제 #41
0
    def run(self):
        """Crawl one account's blog pages and download every new image.

        Reads self.account_info = [account_name, image_count, last_blog_time];
        updates the last two fields and appends the record to
        NEW_SAVE_DATA_PATH when the account is done.
        """
        global TOTAL_IMAGE_COUNT

        account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # Download into a temp directory when re-sorting is requested,
            # otherwise straight into the destination directory.
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)

            image_count = 1
            page_count = 1
            first_blog_time = "0"
            is_over = False
            need_make_image_dir = True
            while not is_over:
                # Fetch one page of blog entries
                blog_data = get_blog_page_data(account_name, page_count)
                if blog_data is None:
                    log.error(account_name + " 第%s页日志无法获取" % page_count)
                    tool.process_exit()

                # Parse the publish time of this page
                blog_time = get_blog_time(blog_data)
                if blog_time is None:
                    log.error(account_name + " 第%s页解析日志时间失败" % page_count)
                    tool.process_exit()

                # Stop when we reach the last blog of the previous crawl
                if blog_time <= int(self.account_info[2]):
                    break

                # The first blog's time becomes the new save-data record
                if first_blog_time == "0":
                    first_blog_time = str(blog_time)

                # Collect every image URL found on this page
                image_url_list = get_image_url_list(blog_data)
                for image_url in image_url_list:
                    # Use the default-resolution image (drop the query string)
                    image_url = image_url.split("?")[0]
                    # Skip emoji images
                    if image_url.find("http://emoji.ameba.jp") >= 0:
                        continue
                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                    # Create the download directory lazily, before the first image
                    if need_make_image_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_image_dir = False

                    file_type = image_url.split(".")[-1]
                    file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_name + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_name + " 第%s张图片 %s 获取失败" % (image_count, image_url))

                # Stop once the configured image count is reached
                if 0 < GET_IMAGE_COUNT < image_count:
                    is_over = True

                if not is_over:
                    if 0 < GET_PAGE_COUNT < page_count:
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))

            # Move files from the temp directory into the numbered save directory
            if IS_SORT and image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建图片子目录 %s 失败" % destination_path)
                    tool.process_exit()

            # New save-data record
            if first_blog_time != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_blog_time

            # Persist the record and update the shared counters under the lock
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            ACCOUNTS.remove(account_name)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        # Fix: "except SystemExit, se" is Python-2-only syntax; "as" works on
        # Python 2.6+ and Python 3.
        except SystemExit as se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #42
0
 def main_thread_check(self):
     """Terminate this worker thread once the main thread is no longer running."""
     if self.main_thread.is_running():
         return
     self.notify_main_thread()
     tool.process_exit(0)
예제 #43
0
    def run(self):
        """Crawl one account's message feed and download new images and videos.

        Reads self.account_info = [account_name, image_count, video_count,
        last_post_id]; updates the counts and last_post_id and appends the
        record to NEW_SAVE_DATA_PATH when the account is done.
        """
        global TOTAL_IMAGE_COUNT
        global TOTAL_VIDEO_COUNT

        account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # Download into temp directories when re-sorting is requested,
            # otherwise straight into the destination directories.
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            image_count = 1
            video_count = 1
            target_id = INIT_TARGET_ID
            first_post_id = "0"
            is_over = False
            need_make_image_dir = True
            need_make_video_dir = True

            while not is_over:
                # Fetch one page of posts
                message_page_data = get_message_page_data(account_name, target_id)
                if message_page_data is None:
                    log.error(account_name + " 媒体列表解析异常")
                    tool.process_exit()
                # Empty page: no more posts
                if len(message_page_data) == 0:
                    break

                for message_info in message_page_data:
                    if not robot.check_sub_key(("post",), message_info):
                        log.error(account_name + " 媒体信息解析异常 %s" % message_info)
                        continue
                    if not robot.check_sub_key(("body", "postId"), message_info["post"]):
                        log.error(account_name + " 媒体信息解析异常 %s" % message_info)
                        continue

                    target_id = message_info["post"]["postId"]
                    # Stop when we reach the last post of the previous crawl
                    if int(target_id) <= int(self.account_info[3]):
                        is_over = True
                        break

                    # The first post's id becomes the new save-data record
                    if first_post_id == "0":
                        first_post_id = str(target_id)

                    for media_info in message_info["post"]["body"]:
                        if not robot.check_sub_key(("bodyType",), media_info):
                            log.error(account_name + " 媒体列表bodyType解析异常")
                            continue

                        # bodyType: 1 = text, 2 = emoji, 3 = image,
                        # 7 = repost, 8 = video
                        body_type = int(media_info["bodyType"])
                        if body_type == 1:  # text
                            pass
                        elif body_type == 2:  # emoji
                            pass
                        elif body_type == 3:  # image
                            if IS_DOWNLOAD_IMAGE:
                                if not robot.check_sub_key(("image",), media_info):
                                    log.error(account_name + " 第%s张图片解析异常%s" % (image_count, media_info))
                                    continue

                                image_url = str(media_info["image"])
                                log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                                # Create the image directory lazily, before the first image
                                if need_make_image_dir:
                                    if not tool.make_dir(image_path, 0):
                                        log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                                        tool.process_exit()
                                    need_make_image_dir = False

                                file_type = image_url.split(".")[-1]
                                image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                                if tool.save_net_file(image_url, image_file_path):
                                    log.step(account_name + " 第%s张图片下载成功" % image_count)
                                    image_count += 1
                                else:
                                    log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                        elif body_type == 8:  # video
                            if IS_DOWNLOAD_VIDEO:
                                if not robot.check_sub_key(("movieUrlHq",), media_info):
                                    log.error(account_name + " 第%s个视频解析异常%s" % (video_count, media_info))
                                    continue

                                video_url = str(media_info["movieUrlHq"])
                                log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url))

                                # Create the video directory lazily, before the first video
                                if need_make_video_dir:
                                    if not tool.make_dir(video_path, 0):
                                        log.error(account_name + " 创建视频下载目录 %s 失败" % video_path)
                                        tool.process_exit()
                                    need_make_video_dir = False

                                file_type = video_url.split(".")[-1]
                                video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type))
                                if tool.save_net_file(video_url, video_file_path):
                                    log.step(account_name + " 第%s个视频下载成功" % video_count)
                                    video_count += 1
                                else:
                                    log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))
                        elif body_type == 7:  # repost
                            pass
                        else:
                            log.error(account_name + " 第%s张图片、第%s个视频,未知bodytype %s, %s" % (image_count, video_count, body_type, media_info))

            # Move files from the temp directories into the numbered save directories
            if IS_SORT:
                if image_count > 1:
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_name + " 图片从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                        tool.process_exit()
                if video_count > 1:
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4):
                        log.step(account_name + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # New save-data record
            if first_post_id != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = str(int(self.account_info[2]) + video_count - 1)
                self.account_info[3] = first_post_id

            # Persist the record and update the shared counters under the lock
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_name)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        # Fix: "except SystemExit, se" is Python-2-only syntax; "as" works on
        # Python 2.6+ and Python 3.
        except SystemExit as se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #44
0
    def __init__(self, sys_config, extra_config=None):
        """Initialize the crawler robot from its configuration files.

        Loads config.ini (plus an optional app.ini and extra_config
        overrides), then sets up logging, download/save paths, proxy,
        browser cookies, HTTP timeouts, thread limits and the optional
        port-listener / keyboard control threads.

        sys_config   -- dict of SYS_* capability flags declared by the app
        extra_config -- optional dict overriding values from the config files
        """
        self.start_time = time.time()

        # Program startup configuration (must be supplied by the app)
        if not isinstance(sys_config, dict):
            self.print_msg("程序启动配置不存在,请检查代码!")
            tool.process_exit()
            return
        sys_download_image = SYS_DOWNLOAD_IMAGE in sys_config
        sys_download_video = SYS_DOWNLOAD_VIDEO in sys_config
        sys_set_proxy = SYS_SET_PROXY in sys_config
        sys_get_cookie = SYS_GET_COOKIE in sys_config
        sys_not_check_save_data = SYS_NOT_CHECK_SAVE_DATA in sys_config
        sys_not_download = SYS_NOT_DOWNLOAD in sys_config

        # Frozen executable: work relative to the executable's directory
        if tool.IS_EXECUTABLE:
            application_path = os.path.dirname(sys.executable)
            os.chdir(application_path)
            config_path = os.path.join(os.getcwd(), "data/config.ini")
        else:
            config_path = tool.PROJECT_CONFIG_PATH

        # Program configuration
        config = read_config(config_path)
        # Application configuration (optional app.ini, overrides config.ini)
        app_config_path = os.path.abspath(
            os.path.join(tool.PROJECT_APP_PATH, "app.ini"))
        if os.path.exists(app_config_path):
            config.update(read_config(app_config_path))
        # Extra configuration passed in by the caller (highest priority)
        if isinstance(extra_config, dict):
            config.update(extra_config)

        # App-declared config entries: (name, default, analysis-mode) triples
        self.app_config = {}
        if SYS_APP_CONFIG in sys_config and len(
                sys_config[SYS_APP_CONFIG]) > 0:
            for app_config_template in sys_config[SYS_APP_CONFIG]:
                if len(app_config_template) == 3:
                    self.app_config[app_config_template[0]] = analysis_config(
                        config, app_config_template[0], app_config_template[1],
                        app_config_template[2])

        # Logging switches and log file paths
        log.IS_SHOW_ERROR = self.is_show_error = analysis_config(
            config, "IS_SHOW_ERROR", True, CONFIG_ANALYSIS_MODE_BOOLEAN)
        log.IS_SHOW_STEP = self.is_show_step = analysis_config(
            config, "IS_SHOW_STEP", True, CONFIG_ANALYSIS_MODE_BOOLEAN)
        log.IS_SHOW_TRACE = self.is_show_trace = analysis_config(
            config, "IS_SHOW_TRACE", False, CONFIG_ANALYSIS_MODE_BOOLEAN)
        error_log_path = analysis_config(config, "ERROR_LOG_PATH",
                                         "\\log/errorLog.txt",
                                         CONFIG_ANALYSIS_MODE_PATH)
        log.ERROR_LOG_PATH = self.error_log_path = replace_path(error_log_path)
        error_log_dir = os.path.dirname(self.error_log_path)
        if not path.create_dir(error_log_dir):
            self.print_msg("创建错误日志目录 %s 失败" % error_log_dir)
            tool.process_exit()
            return
        is_log_step = analysis_config(config, "IS_LOG_STEP", True,
                                      CONFIG_ANALYSIS_MODE_BOOLEAN)
        if not is_log_step:
            log.STEP_LOG_PATH = self.step_log_path = ""
        else:
            step_log_path = analysis_config(config, "STEP_LOG_PATH",
                                            "\\log/stepLog.txt",
                                            CONFIG_ANALYSIS_MODE_PATH)
            log.STEP_LOG_PATH = self.step_log_path = replace_path(
                step_log_path)
            # Directory holding the step log file
            step_log_dir = os.path.dirname(self.step_log_path)
            if not path.create_dir(step_log_dir):
                self.print_msg("创建步骤日志目录 %s 失败" % step_log_dir)
                tool.process_exit()
                return
        is_log_trace = analysis_config(config, "IS_LOG_TRACE", True,
                                       CONFIG_ANALYSIS_MODE_BOOLEAN)
        if not is_log_trace:
            log.TRACE_LOG_PATH = self.trace_log_path = ""
        else:
            trace_log_path = analysis_config(config, "TRACE_LOG_PATH",
                                             "\\log/traceLog.txt",
                                             CONFIG_ANALYSIS_MODE_PATH)
            log.TRACE_LOG_PATH = self.trace_log_path = replace_path(
                trace_log_path)
            # Directory holding the trace log file
            trace_log_dir = os.path.dirname(self.trace_log_path)
            if not path.create_dir(trace_log_dir):
                self.print_msg("创建调试日志目录 %s 失败" % trace_log_dir)
                tool.process_exit()
                return

        # Download switches: config value AND the app's capability flag
        self.is_download_image = analysis_config(
            config, "IS_DOWNLOAD_IMAGE", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_image
        self.is_download_video = analysis_config(
            config, "IS_DOWNLOAD_VIDEO", True,
            CONFIG_ANALYSIS_MODE_BOOLEAN) and sys_download_video

        if not sys_not_download and not self.is_download_image and not self.is_download_video:
            if sys_download_image or sys_download_video:
                self.print_msg("所有支持的下载都没有开启,请检查配置!")
                tool.process_exit()
                return

        # Save-data (crawl progress) file
        self.save_data_path = analysis_config(config, "SAVE_DATA_PATH",
                                              "\\\\info/save.data",
                                              CONFIG_ANALYSIS_MODE_PATH)
        if not sys_not_check_save_data and not os.path.exists(
                self.save_data_path):
            # Save-data file does not exist
            self.print_msg("存档文件%s不存在!" % self.save_data_path)
            tool.process_exit()
            return
        temp_file_name = time.strftime(
            "%m-%d_%H_%M_", time.localtime(time.time())) + os.path.basename(
                self.save_data_path)
        self.temp_save_data_path = os.path.join(
            os.path.dirname(self.save_data_path), temp_file_name)
        if os.path.exists(self.temp_save_data_path):
            # Temp save-data file already exists (previous run unfinished?)
            self.print_msg("存档临时文件%s已存在!" % self.temp_save_data_path)
            tool.process_exit()
            return

        # Session data file
        self.session_data_path = analysis_config(config, "SESSION_DATA_PATH",
                                                 "\\\\info/session.data",
                                                 CONFIG_ANALYSIS_MODE_PATH)

        # Image download directory (only when image download is enabled)
        if self.is_download_image:
            # Directory where downloaded images are saved
            self.image_download_path = analysis_config(
                config, "IMAGE_DOWNLOAD_PATH", "\\\\photo",
                CONFIG_ANALYSIS_MODE_PATH)
            if not path.create_dir(self.image_download_path):
                # Failed to create the image save directory
                self.print_msg("图片保存目录%s创建失败!" % self.image_download_path)
                tool.process_exit()
                return
        else:
            self.image_download_path = ""
        # Video download directory (only when video download is enabled)
        if self.is_download_video:
            # Directory where downloaded videos are saved
            self.video_download_path = analysis_config(
                config, "VIDEO_DOWNLOAD_PATH", "\\\\video",
                CONFIG_ANALYSIS_MODE_PATH)
            if not path.create_dir(self.video_download_path):
                # Failed to create the video save directory
                self.print_msg("视频保存目录%s创建失败!" % self.video_download_path)
                tool.process_exit()
                return
        else:
            self.video_download_path = ""

        # Proxy: 1 = always on, 2 = on only when the app requests it
        is_proxy = analysis_config(config, "IS_PROXY", 2,
                                   CONFIG_ANALYSIS_MODE_INTEGER)
        if is_proxy == 1 or (is_proxy == 2 and sys_set_proxy):
            proxy_ip = analysis_config(config, "PROXY_IP", "127.0.0.1")
            proxy_port = analysis_config(config, "PROXY_PORT", "8087")
            # Connection pool that routes through the proxy
            net.set_proxy(proxy_ip, proxy_port)
        else:
            # Plain urllib3 connection pool
            net.init_http_connection_pool()

        # Cookies read from the local browser, if the app asks for them
        self.cookie_value = {}
        if sys_get_cookie:
            # OS & browser selection
            browser_type = analysis_config(config, "BROWSER_TYPE", 2,
                                           CONFIG_ANALYSIS_MODE_INTEGER)
            # Cookie file location: auto-detected or taken from config
            is_auto_get_cookie = analysis_config(config, "IS_AUTO_GET_COOKIE",
                                                 True,
                                                 CONFIG_ANALYSIS_MODE_BOOLEAN)
            if is_auto_get_cookie:
                cookie_path = browser.get_default_browser_cookie_path(
                    browser_type)
            else:
                cookie_path = analysis_config(config, "COOKIE_PATH", "")
            all_cookie_from_browser = browser.get_all_cookie_from_browser(
                browser_type, cookie_path)
            for cookie_domain in sys_config[SYS_GET_COOKIE]:
                # Specific cookie keys requested for this domain
                if sys_config[SYS_GET_COOKIE][cookie_domain]:
                    for cookie_key in sys_config[SYS_GET_COOKIE][
                            cookie_domain]:
                        self.cookie_value[cookie_key] = ""
                    if cookie_domain in all_cookie_from_browser:
                        for cookie_name in self.cookie_value:
                            if cookie_name in all_cookie_from_browser[
                                    cookie_domain]:
                                self.cookie_value[
                                    cookie_name] = all_cookie_from_browser[
                                        cookie_domain][cookie_name]
                # No keys specified: take every cookie of the domain
                else:
                    if cookie_domain in all_cookie_from_browser:
                        for cookie_name in all_cookie_from_browser[
                                cookie_domain]:
                            self.cookie_value[
                                cookie_name] = all_cookie_from_browser[
                                    cookie_domain][cookie_name]

        # HTTP settings
        # NOTE(review): the "HTTP_DOWLOAD_*" config key names look misspelled
        # ("DOWLOAD") — kept as-is because existing config files use them.
        net.HTTP_CONNECTION_TIMEOUT = analysis_config(
            config, "HTTP_CONNECTION_TIMEOUT", 10,
            CONFIG_ANALYSIS_MODE_INTEGER)
        net.HTTP_READ_TIMEOUT = analysis_config(config, "HTTP_READ_TIMEOUT",
                                                10,
                                                CONFIG_ANALYSIS_MODE_INTEGER)
        net.HTTP_DOWNLOAD_CONNECTION_TIMEOUT = analysis_config(
            config, "HTTP_DOWLOAD_CONNECTION_TIMEOUT", 10,
            CONFIG_ANALYSIS_MODE_INTEGER)
        net.HTTP_DOWNLOAD_READ_TIMEOUT = analysis_config(
            config, "HTTP_DOWLOAD_READ_TIMEOUT", 60,
            CONFIG_ANALYSIS_MODE_INTEGER)
        net.HTTP_REQUEST_RETRY_COUNT = analysis_config(
            config, "HTTP_REQUEST_RETRY_COUNT", 10,
            CONFIG_ANALYSIS_MODE_INTEGER)

        # Thread limits and synchronization primitives
        self.thread_count = analysis_config(config, "THREAD_COUNT", 10,
                                            CONFIG_ANALYSIS_MODE_INTEGER)
        self.thread_lock = threading.Lock()  # guards shared global counters
        self.thread_condition = threading.Condition(
        )  # wait() when the thread count hits the limit, until a thread notify()s

        # Port-listener thread: lets another process pause/resume/stop us
        # NOTE(review): config key "IS_PORT_LISTENER_ENVET" looks misspelled
        # ("ENVET") — kept as-is because existing config files use it.
        if analysis_config(config, "IS_PORT_LISTENER_ENVET", True,
                           CONFIG_ANALYSIS_MODE_BOOLEAN):
            listener_event_bind = {}
            # Pause the process
            listener_event_bind[str(
                portListenerEvent.PROCESS_STATUS_PAUSE)] = net.pause_request
            # Resume the process
            listener_event_bind[str(
                portListenerEvent.PROCESS_STATUS_RUN)] = net.resume_request
            # Stop the process (cancel current threads, finish up)
            listener_event_bind[str(
                portListenerEvent.PROCESS_STATUS_STOP)] = self.stop_process

            listener_port = analysis_config(config, "LISTENER_PORT", 12345,
                                            CONFIG_ANALYSIS_MODE_INTEGER)
            process_control_thread = portListenerEvent.PortListenerEvent(
                port=listener_port, event_list=listener_event_bind)
            process_control_thread.setDaemon(True)
            process_control_thread.start()

        # Keyboard control thread (Windows only)
        if platform.system() == "Windows" and analysis_config(
                config, "IS_KEYBOARD_EVENT", True,
                CONFIG_ANALYSIS_MODE_BOOLEAN):
            keyboard_event_bind = {}
            pause_process_key = analysis_config(config,
                                                "PAUSE_PROCESS_KEYBOARD_KEY",
                                                "F9")
            # Pause the process
            if pause_process_key:
                keyboard_event_bind[pause_process_key] = net.pause_request
            # Resume the process
            continue_process_key = analysis_config(
                config, "CONTINUE_PROCESS_KEYBOARD_KEY", "F10")
            if continue_process_key:
                keyboard_event_bind[continue_process_key] = net.resume_request
            # Stop the process (cancel current threads, finish up)
            stop_process_key = analysis_config(config,
                                               "STOP_PROCESS_KEYBOARD_KEY",
                                               "CTRL + F12")
            if stop_process_key:
                keyboard_event_bind[stop_process_key] = self.stop_process

            if keyboard_event_bind:
                keyboard_control_thread = keyboardEvent.KeyboardEvent(
                    keyboard_event_bind)
                keyboard_control_thread.setDaemon(True)
                keyboard_control_thread.start()

        # Running totals, accumulated by worker threads
        self.total_image_count = 0
        self.total_video_count = 0

        self.print_msg("初始化完成")
예제 #45
0
    def run(self):
        """Crawl one account's video list and download every new video.

        Reads self.account_info = [account_id, video_count, last_scid,
        (optional) account_name]; updates video_count / last_scid and
        appends the record to NEW_SAVE_DATA_PATH when the account is done.
        """
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        # Use the optional display name from the save data when present
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # Download into a temp directory when re-sorting is requested,
            # otherwise straight into the destination directory.
            if IS_SORT:
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            suid = get_suid(account_id)
            if suid is None:
                log.error(account_name + " suid获取失败")

            page_count = 1
            video_count = 1
            first_video_scid = ""
            unique_list = []
            is_over = False
            need_make_download_dir = True
            # Fix: get_suid() returns None on failure, but the old condition
            # "suid != ''" is True for None and let the loop run with a None
            # suid. A truthiness test skips the loop for both None and "".
            while suid and (not is_over):
                # Fetch one page of video info
                media_page = get_one_page_video_data(suid, page_count)
                if media_page is None:
                    log.error(account_name + " 视频列表获取失败")
                    tool.process_exit()

                # Extract the scid list for this page
                scid_list = get_scid_list(media_page["msg"])
                if len(scid_list) == 0:
                    log.error(account_name + " 在视频列表:%s 中没有找到视频scid" % str(media_page["msg"]))
                    tool.process_exit()

                for scid in scid_list:
                    scid = str(scid)

                    # Stop when we reach the last video of the previous crawl.
                    # NOTE(review): this compares first_video_scid (not the
                    # current scid) to the saved record — looks suspicious but
                    # kept as-is; verify intended behavior before changing.
                    if first_video_scid == self.account_info[2]:
                        is_over = True
                        break

                    # New uploads can shift pages and repeat scids; de-duplicate
                    if scid in unique_list:
                        continue
                    else:
                        unique_list.append(scid)
                    # The first video's id becomes the new save-data record
                    if first_video_scid == "":
                        first_video_scid = scid

                    # Resolve the video download URL
                    video_url = get_video_url_by_video_id(scid)
                    if video_url is None:
                        log.error(account_name + " 第%s个视频 %s 获取下载地址失败" % (video_count, scid))
                        continue

                    log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_url))

                    # Create the download directory lazily, before the first video
                    if need_make_download_dir:
                        if not tool.make_dir(video_path, 0):
                            log.error(account_name + " 创建视频下载目录 %s 失败" % video_path)
                            tool.process_exit()
                        need_make_download_dir = False

                    file_path = os.path.join(video_path, "%04d.mp4" % video_count)
                    if tool.save_net_file(video_url, file_path):
                        log.step(account_name + " 第%s个视频下载成功" % video_count)
                        video_count += 1
                    else:
                        log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))

                    # Stop once the configured video count is reached
                    if 0 < GET_VIDEO_COUNT < video_count:
                        is_over = True
                        break

                if not is_over:
                    if media_page["isall"]:
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_name + " 下载完毕,总共获得%s个视频" % (video_count - 1))

            # Move files from the temp directory into the numbered save directory
            if IS_SORT and video_count > 1:
                destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                if robot.sort_file(video_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 视频从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                    tool.process_exit()

            # New save-data record
            if first_video_scid != "":
                self.account_info[1] = str(int(self.account_info[1]) + video_count - 1)
                self.account_info[2] = first_video_scid

            # Persist the record and update the shared counters under the lock
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        # Fix: "except SystemExit, se" is Python-2-only syntax; "as" works on
        # Python 2.6+ and Python 3.
        except SystemExit as se:
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")
예제 #46
0
    def run(self):
        global TOTAL_IMAGE_COUNT
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 6 and self.account_info[5]:
            account_name = self.account_info[5]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " 开始")

            # 如果需要重新排序则使用临时文件夹,否则直接下载到目标目录
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            image_count = 1
            first_image_time = "0"
            need_make_image_dir = True
            while IS_DOWNLOAD_IMAGE:
                # 获取全部图片地址列表
                image_url_list = get_image_url_list(account_id)
                if image_url_list is None:
                    log.error(account_name + " 图片列表获取失败")
                    break

                for image_url in list(image_url_list):
                    # 不使用缩略图
                    image_url = image_url.split("@")[0]
                    image_return_code, image_byte, image_response = tool.http_request(image_url)
                    if image_return_code != 1:
                        log.step(account_name + " 第%s张图片下载失败" % image_count)
                        continue

                    # 获取图片的上传时间(字符串)
                    response_last_modified_time = tool.get_response_info(image_response.info(), "Last-Modified")
                    # 字符串转换为时间戳
                    image_created_time = tool.response_time_to_timestamp(response_last_modified_time)

                    # Stop when we reach an image uploaded at or before the
                    # previously-archived crawl point (account_info[4] holds
                    # the newest image timestamp from the last run).
                    if int(image_created_time) <= int(self.account_info[4]):
                        break

                    # Remember the first (i.e. newest) image's upload time as
                    # the new archive watermark; written back near the end.
                    if first_image_time == "0":
                        first_image_time = str(image_created_time)

                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))

                    # Create the image download directory lazily, only once the
                    # first image actually needs saving.
                    if need_make_image_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_image_dir = False

                    # Extension comes from the URL; the ":" split strips any
                    # trailing qualifier after the extension (e.g. "jpg:large").
                    file_type = image_url.split(".")[-1].split(":")[0]
                    image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    save_image(image_byte, image_file_path)
                    log.step(account_name + " 第%s张图片下载成功" % image_count)
                    image_count += 1

                    # Reached the configured per-account image limit
                    # (GET_IMAGE_COUNT == 0 means unlimited).
                    if 0 < GET_IMAGE_COUNT < image_count:
                        break
                break

            # Videos
            video_count = 1
            first_video_time = "0"
            need_make_video_dir = True
            while IS_DOWNLOAD_VIDEO:
                # Fetch the complete list of video IDs for this account.
                video_id_list = get_video_id_list(account_id)
                if video_id_list is None:
                    log.error(account_name + " 视频列表获取失败")
                    break

                for video_id in list(video_id_list):
                    # Fetch the video's timestamp and download address.
                    video_info = get_video_info(video_id)
                    if video_info is None:
                        log.error(account_name + " 第%s个视频 %s 信息获取失败" % (video_count, video_id))
                        continue

                    # Stop when we reach a video uploaded at or before the
                    # previously-archived crawl point (account_info[2] holds
                    # the newest video timestamp from the last run).
                    if int(video_info["data"]["createtime"]) <= int(self.account_info[2]):
                        break

                    # Remember the first (newest) video's upload time as the
                    # new archive watermark.
                    if first_video_time == "0":
                        first_video_time = str(video_info["data"]["createtime"])

                    # URL of the m3u8 playlist file.
                    link_url = str(video_info["data"]["linkurl"])
                    # Resolve the playlist into the real .ts segment URLs.
                    ts_url_list = get_ts_url_list(link_url)
                    if ts_url_list is None:
                        log.error(account_name + " 第%s个视频下载地址列表 %s 获取失败" % (video_count, link_url))
                        continue

                    log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, ts_url_list))

                    # Create the video download directory lazily, on the first
                    # video only.
                    if need_make_video_dir:
                        if not tool.make_dir(video_path, 0):
                            # NOTE(review): this message says "图片" (image)
                            # although it reports failure creating the VIDEO
                            # directory — looks like a copy-paste slip in the
                            # log text; confirm before changing the string.
                            log.error(account_name + " 创建图片下载目录 %s 失败" % video_path)
                            tool.process_exit()
                        need_make_video_dir = False

                    video_file_path = os.path.join(video_path, "%04d.ts" % video_count)
                    if save_video(ts_url_list, video_file_path):
                        log.step(account_name + " 第%s个视频下载成功" % video_count)
                        video_count += 1
                    else:
                        log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, ts_url_list))

                    # Reached the configured per-account video limit
                    # (GET_VIDEO_COUNT == 0 means unlimited).
                    if 0 < GET_VIDEO_COUNT < video_count:
                        break
                break

            log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))

            # Sorting: move files from the temporary download directories into
            # the permanent save directories, renumbering from the archived
            # totals (account_info[3] = image total, account_info[1] = video
            # total so far, 4-digit zero-padded names).
            if IS_SORT:
                if image_count > 1:
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[3]), 4):
                        log.step(account_name + " 图片从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                        tool.process_exit()
                if video_count > 1:
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_name + " 视频从下载目录移动到保存目录成功")
                    else:
                        log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                        tool.process_exit()

            # Advance the per-account archive record: cumulative image count
            # and the new newest-image timestamp.
            if first_image_time != "0":
                self.account_info[3] = str(int(self.account_info[3]) + image_count - 1)
                self.account_info[4] = first_image_time

            # Same for videos: cumulative count and newest-video timestamp.
            if first_video_time != "0":
                self.account_info[1] = str(int(self.account_info[1]) + video_count - 1)
                self.account_info[2] = first_video_time

            # Persist the updated archive line (tab-separated fields).
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            # Update shared module-level totals under the lock shared by all
            # downloader threads.
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " 完成")
        except SystemExit, se:  # Python 2 syntax; tool.process_exit() raises SystemExit
            # Exit code 0 = deliberate early stop, anything else = abnormal exit.
            if se.code == 0:
                log.step(account_name + " 提前退出")
            else:
                log.error(account_name + " 异常退出")