Example #1
    def main(self):
        global ACCOUNTS

        # TODO: save file format
        # Parse the save file
        # account_id
        account_list = robot.read_save_data(self.save_data_path, 0, ["", ])
        ACCOUNTS = account_list.keys()

        # Loop over each id and download it
        main_thread_count = threading.activeCount()
        for account_id in sorted(account_list.keys()):
            # Check the number of currently running threads
            while threading.activeCount() >= self.thread_count + main_thread_count:
                if robot.is_process_end() == 0:
                    time.sleep(10)
                else:
                    break

            # Early exit
            if robot.is_process_end() > 0:
                break

            # Start the download
            thread = Download(account_list[account_id], self.thread_lock)
            thread.start()

            time.sleep(1)

        # Wait until all threads except the main thread have finished
        while threading.activeCount() > main_thread_count:
            time.sleep(10)

        # Save data of accounts that are not finished
        if len(ACCOUNTS) > 0:
            new_save_data_file = open(NEW_SAVE_DATA_PATH, "a")
            for account_id in ACCOUNTS:
                new_save_data_file.write("\t".join(account_list[account_id]) + "\n")
            new_save_data_file.close()

        # TODO: whether images or videos need to be downloaded
        # Delete the temporary directories
        tool.remove_dir(IMAGE_TEMP_PATH)
        tool.remove_dir(VIDEO_TEMP_PATH)

        # Re-sort and save the save file
        robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path)

        # TODO: whether images or videos need to be downloaded
        log.step("All downloads finished, took %s seconds, %s images and %s videos in total" % (self.get_run_time(), TOTAL_IMAGE_COUNT, TOTAL_VIDEO_COUNT))
Example #2
    def main(self):
        global ACCOUNTS

        # Parse the save file
        # account_name  image_count  video_count  last_created_time
        account_list = robot.read_save_data(self.save_data_path, 0, ["", "0", "0", "0"])
        ACCOUNTS = account_list.keys()

        if not set_csrf_token():
            log.error("token和session获取查找失败")
            tool.process_exit()

        # Loop over each account and download it
        main_thread_count = threading.activeCount()
        for account_name in sorted(account_list.keys()):
            # Check the number of currently running threads
            while threading.activeCount() >= self.thread_count + main_thread_count:
                if robot.is_process_end() == 0:
                    time.sleep(10)
                else:
                    break

            # Early exit
            if robot.is_process_end() > 0:
                break

            # Start the download
            thread = Download(account_list[account_name], self.thread_lock)
            thread.start()

            time.sleep(1)

        # Wait until all threads except the main thread have finished
        while threading.activeCount() > main_thread_count:
            time.sleep(10)

        # Save data of accounts that are not finished
        if len(ACCOUNTS) > 0:
            new_save_data_file = open(NEW_SAVE_DATA_PATH, "a")
            for account_name in ACCOUNTS:
                new_save_data_file.write("\t".join(account_list[account_name]) + "\n")
            new_save_data_file.close()

        # Delete the temporary directories
        self.finish_task()

        # Re-sort and save the save file
        robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path)

        log.step("全部下载完毕,耗时%s秒,共计图片%s张,视频%s个" % (self.get_run_time(), TOTAL_IMAGE_COUNT, TOTAL_VIDEO_COUNT))
Example #3
    def main(self):
        global ACCOUNTS

        # Parse the save file
        # account_id  image_count  last_image_time  video_count  last_video_url  (account_name)
        account_list = robot.read_save_data(self.save_data_path, 0, ["", "0", "0", "0", ""])
        ACCOUNTS = account_list.keys()

        # Visit the page first to generate cookies
        auto_redirect_visit("http://www.weibo.com/")
        time.sleep(2)

        # Loop over each id and download it
        main_thread_count = threading.activeCount()
        for account_id in sorted(account_list.keys()):
            # Check the number of currently running threads
            while threading.activeCount() >= self.thread_count + main_thread_count:
                if robot.is_process_end() == 0:
                    time.sleep(10)
                else:
                    break

            # Early exit
            if robot.is_process_end() > 0:
                break

            # Start the download
            thread = Download(account_list[account_id], self.thread_lock)
            thread.start()

            time.sleep(1)

        # Wait until all threads except the main thread have finished
        while threading.activeCount() > main_thread_count:
            time.sleep(10)

        # Save data of accounts that are not finished
        if len(ACCOUNTS) > 0:
            new_save_data_file = open(NEW_SAVE_DATA_PATH, "a")
            for account_id in ACCOUNTS:
                new_save_data_file.write("\t".join(account_list[account_id]) + "\n")
            new_save_data_file.close()

        # Delete the temporary directories
        self.finish_task()

        # Re-sort and save the save file
        robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path)

        log.step("全部下载完毕,耗时%s秒,共计图片%s张,视频%s个" % (self.get_run_time(), TOTAL_IMAGE_COUNT, TOTAL_VIDEO_COUNT))
Example #4
    def main(self):
        global ACCOUNTS

        # Check the login status
        # If not logged in, warn that follower-only works may not be retrievable
        check_login()

        # Parse the save file
        # account_id  last_rp_id
        account_list = robot.read_save_data(self.save_data_path, 0, ["", "0"])
        ACCOUNTS = account_list.keys()

        # Loop over each id and download it
        main_thread_count = threading.activeCount()
        for account_id in sorted(account_list.keys()):
            # Check the number of currently running threads
            while threading.activeCount() >= self.thread_count + main_thread_count:
                if robot.is_process_end() == 0:
                    time.sleep(10)
                else:
                    break

            # Early exit
            if robot.is_process_end() > 0:
                break

            # Start the download
            thread = Download(account_list[account_id], self.thread_lock)
            thread.start()

            time.sleep(1)

        # Wait until all threads except the main thread have finished
        while threading.activeCount() > main_thread_count:
            time.sleep(10)

        # Save data of accounts that are not finished
        if len(ACCOUNTS) > 0:
            new_save_data_file = open(NEW_SAVE_DATA_PATH, "a")
            for account_id in ACCOUNTS:
                new_save_data_file.write("\t".join(account_list[account_id]) + "\n")
            new_save_data_file.close()

        # Re-sort and save the save file
        robot.rewrite_save_file(NEW_SAVE_DATA_PATH, self.save_data_path)

        log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), TOTAL_IMAGE_COUNT))
Example #5
    def main(self):
        # Parse the save file
        # Look for fkoji.save
        account_list = robot.read_save_data(self.save_data_path, 0, ["", "", ""])

        # This key holds the aggregate data
        if ALL_SIGN in account_list:
            image_start_index = int(account_list[ALL_SIGN][1])
            save_data_image_time = int(account_list[ALL_SIGN][2])
            account_list.pop(ALL_SIGN)
        else:
            image_start_index = 0
            save_data_image_time = 0

        if self.is_sort:
            image_path = self.image_temp_path
        else:
            image_path = self.image_download_path

        if not tool.make_dir(image_path, 0):
            # Failed to create the image save directory
            self.print_msg("Failed to create the image download directory %s!" % self.image_download_path)
            tool.process_exit()

        # Download
        page_index = 1
        image_count = 1
        first_image_time = 0
        unique_list = []
        is_over = False

        while not is_over:
            index_url = "http://jigadori.fkoji.com/?p=%s" % page_index
            index_page_return_code, index_page_response = tool.http_request(index_url)[:2]
            if index_page_return_code != 1:
                log.error("无法访问首页地址 %s" % index_url)
                tool.process_exit()

            index_page = BeautifulSoup.BeautifulSoup(index_page_response)
            photo_list = index_page.body.findAll("div", "photo")
            # Reached the last page
            if not photo_list:
                break
            for photo_info in photo_list:
                if isinstance(photo_info, BeautifulSoup.NavigableString):
                    continue

                # Parse the tweet's creation timestamp from the photo block
                tweet_created_time = get_tweet_created_time(photo_info)
                if tweet_created_time is None:
                    log.error("第%s张图片,解析tweet-created-at失败" % image_count)
                    continue

                # Already downloaded (reached the saved timestamp)
                if tweet_created_time <= save_data_image_time:
                    is_over = True
                    break

                # Use the first image's upload time as the new save record
                if first_image_time == 0:
                    first_image_time = tweet_created_time

                # Parse the tweet's posting account from the photo block
                account_id = get_tweet_account_id(photo_info)
                if account_id is None:
                    log.error("第%s张图片,解析tweet账号失败" % image_count)
                    continue

                # Find the images
                img_tags = photo_info.findAll("img")
                for tag in img_tags:
                    tag_attr = dict(tag.attrs)
                    if robot.check_sub_key(("src", "alt"), tag_attr):
                        image_url = str(tag_attr["src"]).replace(" ", "")
                        # Duplicate check (newly added images can shift pages and cause repeats)
                        if image_url in unique_list:
                            continue
                        else:
                            unique_list.append(image_url)

                        log.step("开始下载第%s张图片 %s" % (image_count, image_url))

                        file_type = image_url.split(".")[-1]
                        if file_type.find("/") != -1:
                            file_type = "jpg"
                        file_path = os.path.join(image_path, "%05d_%s.%s" % (image_count, account_id, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step("第%s张图片下载成功" % image_count)
                            image_count += 1
                        else:
                            log.error("第%s张图片 %s,account_id:%s,下载失败" % (image_count, image_url, account_id))
                if is_over:
                    break

            if not is_over:
                page_index += 1

        log.step("下载完毕")

        # Sort and copy to the save directory
        if self.is_sort:
            is_check_ok = False
            while not is_check_ok:
                # Wait for the manual check of all images to finish
                input_str = raw_input(tool.get_time() + " Download finished. Proceed to the next step? (Y)es or (N)o: ")
                input_str = input_str.lower()
                if input_str in ["y", "yes"]:
                    is_check_ok = True
                elif input_str in ["n", "no"]:
                    tool.process_exit()

            all_path = os.path.join(self.image_download_path, "all")
            if not tool.make_dir(all_path, 0):
                log.error("创建目录 %s 失败" % all_path)
                tool.process_exit()

            file_list = tool.get_dir_files_name(self.image_temp_path, "desc")
            for file_name in file_list:
                image_path = os.path.join(self.image_temp_path, file_name)
                file_name_list = file_name.split(".")
                file_type = file_name_list[-1]
                account_id = "_".join(".".join(file_name_list[:-1]).split("_")[1:])

                # All images
                image_start_index += 1
                destination_file_name = "%05d_%s.%s" % (image_start_index, account_id, file_type)
                destination_path = os.path.join(all_path, destination_file_name)
                tool.copy_files(image_path, destination_path)

                # Per account
                each_account_path = os.path.join(self.image_download_path, "single", account_id)
                if not os.path.exists(each_account_path):
                    if not tool.make_dir(each_account_path, 0):
                        log.error("创建目录 %s 失败" % each_account_path)
                        tool.process_exit()
                if account_id in account_list:
                    account_list[account_id][1] = int(account_list[account_id][1]) + 1
                else:
                    account_list[account_id] = [account_id, 1]
                destination_file_name = "%05d.%s" % (account_list[account_id][1], file_type)
                destination_path = os.path.join(each_account_path, destination_file_name)
                tool.copy_files(image_path, destination_path)

            log.step("图片从下载目录移动到保存目录成功")

            # Delete the temporary directory
            tool.remove_dir(self.image_temp_path)

        # Save the new save file
        temp_list = [account_list[key] for key in sorted(account_list.keys())]
        # Insert the aggregate data at the head of the list (keep the previous timestamp if nothing new was downloaded)
        temp_list.insert(0, [ALL_SIGN, str(image_start_index), str(first_image_time if first_image_time > 0 else save_data_image_time)])
        tool.write_file(tool.list_to_string(temp_list), self.save_data_path, 2)

        log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), image_count - 1))