Example #1
 def trace(self, message, include_display_name=True):
     """
     trace log
     """
     if include_display_name and self.display_name is not None:
         message = self.display_name + " " + message
     log.trace(message)
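A minimal self-contained sketch of the same pattern using only the standard library: stdlib logging has no TRACE level, so the sketch registers a hypothetical one below DEBUG (the class and level names here are assumptions, not part of the example above).

import logging

TRACE = 5  # hypothetical level number, below DEBUG (10)
logging.addLevelName(TRACE, "TRACE")

class Traceable:
    def __init__(self, display_name=None):
        self.display_name = display_name

    def trace(self, message, include_display_name=True):
        # Prefix the message with the display name, as in the example above
        if include_display_name and self.display_name is not None:
            message = self.display_name + " " + message
        logging.log(TRACE, message)
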
Example #2
    def main(self):
        page_count = 1
        image_count = 0
        main_thread_count = threading.activeCount()
        # Multi-threaded download modes
        # 1: start N download threads at once
        # 2: download all images on one page with multiple threads, then move on to the next page
        thread_type = 2
        while True:
            # Fetch one page
            page_data = get_one_page_data(page_count)
            if page_data is None:
                log.error("Failed to fetch page %s" % page_count)
                break

            # Get the list of all image info on the page
            image_info_list = re.findall('<img src="" data-original="([^"]*)" class="lazy img" title="([^"]*)">', page_data)
            # Get the number of films on the page
            page_data_count = page_data.count('<div class="item pull-left">')

            # Everything has been downloaded
            if page_data_count == 0:
                break
            log.step("Page %s: %s films, %s cover images found" % (page_count, page_data_count, len(image_info_list)))

            for small_image_url, title in image_info_list:
                # Thread limit reached; wait
                while thread_type == 1 and threading.activeCount() >= self.thread_count + main_thread_count:
                    time.sleep(5)

                title = robot.filter_text(str(title)).upper()
                image_url = get_large_image_url(small_image_url)
                if image_url is None:
                    log.trace("Failed to get the full-size cover image URL for %s" % title)
                    continue

                log.step("Starting download of the cover image for %s: %s" % (title, image_url))

                file_type = image_url.split(".")[-1]
                file_path = os.path.join(self.image_download_path, "%s.%s" % (title, file_type))
                file_temp_path = os.path.join(self.image_download_path, "%s_temp.%s" % (title, file_type))

                # Start the download
                thread = Download(self.thread_lock, title, file_path, file_temp_path, image_url)
                thread.start()
                time.sleep(0.1)

            # Some threads have not finished yet
            while thread_type == 2 and threading.activeCount() > main_thread_count:
                time.sleep(5)

            page_count += 1

        log.step("All downloads finished in %s seconds, %s images in total" % (self.get_run_time(), TOTAL_IMAGE_COUNT))
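Both throttling modes poll threading.activeCount() against a baseline taken before any downloads start. A minimal sketch of the same cap-and-drain pattern, with a hypothetical worker in place of the Download thread class:

import threading
import time

def worker(url):
    time.sleep(0.5)  # placeholder for the actual download work (hypothetical)

def download_all(urls, thread_count=4):
    base = threading.active_count()  # threads alive before we start
    for url in urls:
        # mode 1: never run more than thread_count download threads at once
        while threading.active_count() >= thread_count + base:
            time.sleep(0.1)
        threading.Thread(target=worker, args=(url,)).start()
    # mode 2: wait until every thread we started has finished
    while threading.active_count() > base:
        time.sleep(0.1)
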
Example #3
def get_dbsnp(data, region, force=False):
    mv = myvariant.MyVariantInfo()
    q = mv.query(
        '_exists_:dbsnp AND _exists_:hg19 AND {}:{}-{}'.format(*region),
        fields='dbsnp',
        fetch_all=True)
    snps = list(q)

    # VCF, dbSNP and myVariant use 1-based indexing
    dbsnp = collections.defaultdict(dict)
    for snp in snps:
        pos = snp['dbsnp']['hg19']['start'] - 1
        ref, alt, rs = snp['dbsnp']['ref'], snp['dbsnp']['alt'], snp['dbsnp']['rsid']
        if len(ref) > 1 or len(alt) > 1:
            assert ref[0] == alt[0]
        if len(ref) > 1:
            op = 'DEL.{}'.format(ref[1:])
        elif len(alt) > 1:
            op = 'INS.{}'.format(alt[1:].lower())
        else:
            op = 'SNP.{}{}'.format(ref, alt)
        dbsnp[pos][op] = rs

    mutations = {}
    for a in sorted(data):
        for m in data[a]['mutations']:
            if m['pos'] == 'pseudogene': continue
            if m['dbsnp'] not in ['', '*']:
                m['dbsnp'] = [m['dbsnp']]
            else:
                m['dbsnp'] = []
            pos, op = m['pos'], m['op']

            # direct match; also accept a SNP whose alleles are recorded in reverse order
            if op not in dbsnp[pos] and op[:3] == 'SNP' and op[:4] + op[4:6][::-1] in dbsnp[pos]:
                op = op[:4] + op[4:6][::-1]
            if op in dbsnp[pos]:
                rsid = str(dbsnp[pos][op])
                if rsid not in m['dbsnp']:
                    if len(m['dbsnp']) > 0:
                        m['dbsnp'][0] += '(k)'
                    m['dbsnp'].append(rsid)
                    log.debug('dbSNP: Variant {} assigned to {}:{}', rsid, pos, op)
                else:
                    log.debug('dbSNP: Variant {} matches the Karolinska prediction', rsid)
            elif len(dbsnp[pos]) != 0:
                log.trace('How about {} for {}:{} ({})', dbsnp[pos], pos, op, m['old'])
    return data
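The op strings encode each variant compactly: SNP.<ref><alt> for substitutions, INS.<bases> (lower-cased) for insertions, and DEL.<bases> for deletions, with the shared leading base stripped. A tiny standalone sketch of the encoding used by get_dbsnp above:

def encode_op(ref, alt):
    # Same encoding as get_dbsnp: the first base is shared for indels
    if len(ref) > 1:
        return 'DEL.{}'.format(ref[1:])
    if len(alt) > 1:
        return 'INS.{}'.format(alt[1:].lower())
    return 'SNP.{}{}'.format(ref, alt)

assert encode_op('A', 'G') == 'SNP.AG'
assert encode_op('TAC', 'T') == 'DEL.AC'
assert encode_op('T', 'TGG') == 'INS.gg'
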
Example #4
    def run(self):
        global TOTAL_IMAGE_COUNT

        coser_id = self.account_info[0]
        if len(self.account_info) >= 3:
            cn = self.account_info[2]
        else:
            cn = self.account_info[0]

        try:
            log.step(cn + " starting")

            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, cn)

            # Image download
            this_cn_total_image_count = 0
            page_count = 1
            total_rp_count = 1
            first_rp_id = ""
            unique_list = []
            is_over = False
            need_make_download_dir = True  # whether the cn directory still needs to be created
            while not is_over:
                # Get one page of posts
                post_page = get_one_page_post(coser_id, page_count)
                if post_page is None:
                    log.error(cn + " can't access page %s of posts" % page_count)
                    tool.process_exit()

                # Parse the posts and get the info of all photo sets
                cp_id, rp_list = get_rp_list(post_page)
                if cp_id is None:
                    log.error(cn + " failed to parse page %s of posts" % page_count)
                    tool.process_exit()

                for rp_id, title in rp_list.items():
                    # Check whether we have reached the images downloaded last time
                    if int(rp_id) <= int(self.account_info[1]):
                        is_over = True
                        break

                    # Skip duplicates caused by newly added photo sets
                    if rp_id in unique_list:
                        continue
                    else:
                        unique_list.append(rp_id)
                    # Use the id of the first photo set as the new archive record
                    if first_rp_id == "":
                        first_rp_id = rp_id

                    log.trace("rp: " + rp_id)

                    if need_make_download_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(cn + " failed to create the CN directory %s" % image_path)
                            tool.process_exit()
                        need_make_download_dir = False

                    # Strip characters not allowed in the title
                    title = robot.filter_text(title)
                    if title:
                        rp_path = os.path.join(image_path, "%s %s" % (rp_id, title))
                    else:
                        rp_path = os.path.join(image_path, rp_id)
                    if not tool.make_dir(rp_path, 0):
                        # Directory creation failed; retry without the title, and give up if that also fails
                        log.error(cn + " failed to create photo set directory %s, retrying without the title" % rp_path)
                        rp_path = os.path.join(image_path, rp_id)
                        if not tool.make_dir(rp_path, 0):
                            log.error(cn + " failed to create photo set directory %s" % rp_path)
                            tool.process_exit()

                    # Get the list of all image URLs in the photo set page
                    image_url_list = get_image_url_list(cp_id, rp_id)
                    if image_url_list is None:
                        log.error(cn + " can't access photo set %s, cp_id: %s" % (rp_id, cp_id))
                        continue

                    if len(image_url_list) == 0 and IS_AUTO_FOLLOW:
                        log.step(cn + " possible private photo sets detected and this account is not a follower; following automatically")
                        if follow(coser_id):
                            # Fetch the list of all image URLs in the photo set page again
                            image_url_list = get_image_url_list(cp_id, rp_id)

                    if len(image_url_list) == 0:
                        log.error(cn + " photo set %s has no images; the account in use may not follow this cn, so follower-only private sets are inaccessible, cp_id: %s" % (rp_id, cp_id))
                        continue

                    image_count = 1
                    for image_url in list(image_url_list):
                        # Drop the trailing resolution segment so no specific resolution is requested
                        image_url = "/".join(image_url.split("/")[0:-1])
                        log.step(cn + " %s starting download of image %s: %s" % (rp_id, image_count, image_url))

                        if image_url.rfind("/") < image_url.rfind("."):
                            file_type = image_url.split(".")[-1]
                        else:
                            file_type = "jpg"
                        file_path = os.path.join(rp_path, "%03d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(cn + " %s image %s downloaded successfully" % (rp_id, image_count))
                            image_count += 1
                        else:
                            log.error(cn + " %s image %s (%s) failed to download" % (rp_id, image_count, image_url))

                    this_cn_total_image_count += image_count - 1

                    if 0 < GET_PAGE_COUNT < total_rp_count:
                        is_over = True
                        break
                    else:
                        total_rp_count += 1

                if not is_over:
                    if page_count >= get_max_page_count(coser_id, post_page):
                        is_over = True
                    else:
                        page_count += 1

            log.step(cn + " finished downloading, %s images in total" % this_cn_total_image_count)

            # New archive record
            if first_rp_id != "":
                self.account_info[1] = first_rp_id

            # Save the final info
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += this_cn_total_image_count
            ACCOUNTS.remove(coser_id)
            self.thread_lock.release()

            log.step(cn + " done")
        except SystemExit:
            log.error(cn + " exited abnormally")
        except Exception as e:
            log.error(cn + " unexpected exception")
            log.error(str(e) + "\n" + str(traceback.format_exc()))
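Several of these run() examples share the same incremental-archive pattern: remember the newest item id from the previous run, walk items newest-first, and stop at the first id at or below the recorded one. A minimal standalone sketch of that pattern:

def incremental_fetch(ids_newest_first, last_seen_id):
    """Return the ids newer than last_seen_id, plus the new archive record."""
    new_ids = []
    for item_id in ids_newest_first:
        if item_id <= last_seen_id:
            break  # reached items downloaded in a previous run
        new_ids.append(item_id)
    # the first (newest) id becomes the archive record for the next run
    new_record = new_ids[0] if new_ids else last_seen_id
    return new_ids, new_record

# usage: ids arrive newest-first, 40 was the newest id seen previously
assert incremental_fetch([52, 48, 40, 31], 40) == ([52, 48], 52)
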
Example #5
    def run(self):
        global TOTAL_IMAGE_COUNT
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]

        try:
            log.step(account_id + " starting")

            # Use a temp directory if re-sorting is needed, otherwise download straight to the target directory
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_id)
                video_path = os.path.join(VIDEO_TEMP_PATH, account_id)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_id)

            page_count = 1
            image_count = 1
            video_count = 1
            first_post_id = ""
            unique_list = []
            is_over = False
            need_make_image_dir = True
            need_make_video_dir = True
            while not is_over:
                post_url_list = get_one_page_post_url_list(account_id, page_count)
                if post_url_list is None:
                    log.error(account_id + " can't access page %s of the archive" % page_count)
                    tool.process_exit()

                if len(post_url_list) == 0:
                    # Everything has been downloaded
                    break

                log.trace(account_id + " all post pages found on archive page %s: %s" % (page_count, post_url_list))
                post_url_list_group_by_post_id = filter_post_url(post_url_list)
                log.trace(account_id + " post pages on archive page %s after dedup and sorting: %s" % (page_count, post_url_list_group_by_post_id))
                log.step(account_id + " archive page %s: found %s post pages" % (page_count, len(post_url_list_group_by_post_id)))
                for post_id in sorted(post_url_list_group_by_post_id.keys(), reverse=True):
                    # Check whether the post id is at or below the last recorded one
                    if post_id <= self.account_info[3]:
                        is_over = True
                        break

                    # Use the first post's id as the new archive record
                    if first_post_id == "":
                        first_post_id = post_id

                    # Fetch the post page and extract the contents of its head tag
                    post_url = "http://%s.tumblr.com/post/%s" % (account_id, post_id)
                    post_page_head = get_post_page_head(post_url, post_url_list_group_by_post_id[post_id])
                    if post_page_head is None:
                        log.error(account_id + " can't access post page %s" % post_url)
                        continue
                    if not post_page_head:
                        log.error(account_id + " failed to extract the head tag from post page %s" % post_url)
                        continue

                    # Get og_type (whether the page is a video, a photo, or something else)
                    og_type = tool.find_sub_string(post_page_head, '<meta property="og:type" content="', '" />')
                    if not og_type:
                        log.error(account_id + " post page %s: failed to get 'og:type'" % post_url)
                        continue

                    # Skip empty, audio, quote and link posts
                    if og_type in ["tumblr-feed:entry", "tumblr-feed:audio", "tumblr-feed:quote", "tumblr-feed:link"]:
                        continue

                    # Skip duplicates caused by newly added posts
                    if post_id in unique_list:
                        continue
                    else:
                        unique_list.append(post_id)

                    # Video download
                    if IS_DOWNLOAD_VIDEO and og_type == "tumblr-feed:video":
                        video_list = get_video_list(account_id, post_id)
                        if video_list is None:
                            log.error(account_id + " video %s: can't access the player page of post id %s" % (video_count, post_id))
                        else:
                            if len(video_list) > 0:
                                for video_url, video_type in list(video_list):
                                    log.step(account_id + " starting download of video %s: %s" % (video_count, video_url))

                                    # First video: create the directory
                                    if need_make_video_dir:
                                        if not tool.make_dir(video_path, 0):
                                            log.error(account_id + " failed to create video download directory %s" % video_path)
                                            tool.process_exit()
                                        need_make_video_dir = False

                                    file_type = video_type.split("/")[-1]
                                    video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type))
                                    if tool.save_net_file(video_url, video_file_path):
                                        log.step(account_id + " video %s downloaded successfully" % video_count)
                                        video_count += 1
                                    else:
                                        log.error(account_id + " video %s (%s) failed to download" % (video_count, video_url))
                            else:
                                log.error(account_id + " video %s: no video found in post id %s" % (video_count, post_id))

                    # Image download
                    if IS_DOWNLOAD_IMAGE:
                        if og_type == "tumblr-feed:video":
                            page_image_url_list = []
                            video_image_url = tool.find_sub_string(post_page_head, '<meta property="og:image" content="', '" />')
                            if video_image_url:
                                page_image_url_list.append(video_image_url)
                        else:
                            page_image_url_list = re.findall(r'"(https?://\w*\.?media\.tumblr\.com/[^"]*)"', post_page_head)
                            log.trace(account_id + " post page %s, all images before filtering: %s" % (post_url, page_image_url_list))
                            # Filter out avatars and the same image at different resolutions
                            page_image_url_list = filter_different_resolution_images(page_image_url_list)
                        log.trace(account_id + " post page %s, all images found: %s" % (post_url, page_image_url_list))
                        if len(page_image_url_list) > 0:
                            for image_url in page_image_url_list:
                                log.step(account_id + " starting download of image %s: %s" % (image_count, image_url))

                                # First image: create the directory
                                if need_make_image_dir:
                                    if not tool.make_dir(image_path, 0):
                                        log.error(account_id + " failed to create image download directory %s" % image_path)
                                        tool.process_exit()
                                    need_make_image_dir = False

                                file_type = image_url.split(".")[-1]
                                image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                                if tool.save_net_file(image_url, image_file_path):
                                    log.step(account_id + " image %s downloaded successfully" % image_count)
                                    image_count += 1
                                else:
                                    log.error(account_id + " image %s (%s) failed to download" % (image_count, image_url))
                        else:
                            log.error(account_id + " image %s: no image found in post page %s" % (image_count, post_url))

                if not is_over:
                    # Reached the page count set in the config file; stop
                    if 0 < GET_PAGE_COUNT <= page_count:
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_id + " finished downloading, %s images and %s videos in total" % (image_count - 1, video_count - 1))

            # Sort
            if IS_SORT:
                if image_count > 1:
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_id + " images moved from the download directory to the save directory")
                    else:
                        log.error(account_id + " failed to create image save directory %s" % destination_path)
                        tool.process_exit()
                if video_count > 1:
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_id)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4):
                        log.step(account_id + " videos moved from the download directory to the save directory")
                    else:
                        log.error(account_id + " failed to create video save directory %s" % destination_path)
                        tool.process_exit()

            # New archive record
            if first_post_id != "":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = str(int(self.account_info[2]) + video_count - 1)
                self.account_info[3] = first_post_id

            # Save the final info
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_id + " done")
        except SystemExit as se:
            if se.code == 0:
                log.step(account_id + " exited early")
            else:
                log.error(account_id + " exited abnormally")
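tool.find_sub_string is not shown on this page; judging from its call sites, it slices the text between a prefix and a suffix, e.g. to pull og:type out of the head tag. A plausible sketch (an assumption about the tool module, not its actual code):

def find_sub_string(text, prefix, suffix):
    # Return the substring between prefix and suffix, or "" if either is missing
    start = text.find(prefix)
    if start == -1:
        return ""
    start += len(prefix)
    end = text.find(suffix, start)
    if end == -1:
        return ""
    return text[start:end]

head = '<meta property="og:type" content="tumblr-feed:photo" />'
assert find_sub_string(head, '<meta property="og:type" content="', '" />') == "tumblr-feed:photo"
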
Example #6
    def run(self):
        global TOTAL_IMAGE_COUNT
        global TOTAL_VIDEO_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 6 and self.account_info[5]:
            account_name = self.account_info[5]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " starting")

            # Use a temp directory if re-sorting is needed, otherwise download straight to the target directory
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
                video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)

            # Videos
            video_count = 1
            account_page_id = None
            first_video_url = ""
            is_over = False
            need_make_video_dir = True
            since_id = INIT_SINCE_ID
            while IS_DOWNLOAD_VIDEO and (not is_over):
                # Get the page_id
                if account_page_id is None:
                    account_page_id = get_account_page_id(account_id)
                    if account_page_id is None:
                        log.error(account_name + " no page_id found on the Weibo profile page")
                        break

                # Get one page of video info after the given time point
                video_page_data = get_one_page_video_data(account_page_id, since_id)
                if video_page_data is None:
                    log.error(account_name + " failed to parse the video list")
                    first_video_url = ""  # restore the archive record
                    break

                # Match all video pages
                video_play_url_list = get_video_play_url_list(video_page_data)
                log.trace(account_name + " all videos for since_id %s: %s" % (since_id, video_play_url_list))
                for video_play_url in video_play_url_list:
                    # Check whether this is the last video from the previous run
                    if self.account_info[4] == video_play_url:
                        is_over = True
                        break

                    # Use the first video's URL as the new archive record
                    if first_video_url == "":
                        first_video_url = video_play_url

                    # Get the download URL of this video
                    return_code, video_url_list = get_video_url(video_play_url)
                    if return_code != 1:
                        if return_code == -1:
                            log.error(account_name + " video %s (%s): source URL not found" % (video_count, video_play_url))
                        elif return_code == -2:
                            log.error(account_name + " video %s (%s) is not accessible" % (video_count, video_play_url))
                        elif return_code == -3:
                            log.error(account_name + " video %s (%s): unsupported video source" % (video_count, video_play_url))
                        continue
                    log.step(account_name + " starting download of video %s: %s" % (video_count, video_play_url))

                    # First video: create the directory
                    if need_make_video_dir:
                        if not tool.make_dir(video_path, 0):
                            log.error(account_name + " failed to create video download directory %s" % video_path)
                            tool.process_exit()
                        need_make_video_dir = False

                    video_file_path = os.path.join(video_path, "%04d.mp4" % video_count)
                    for video_url in video_url_list:
                        if tool.save_net_file(video_url, video_file_path):
                            log.step(account_name + " video %s downloaded successfully" % video_count)
                            video_count += 1
                        else:
                            log.error(account_name + " video %s (%s) failed to download" % (video_count, video_url))

                    # Reached the download count set in the config file; stop
                    if 0 < GET_VIDEO_COUNT < video_count:
                        is_over = True
                        break

                if not is_over:
                    # Get the since_id of the next page
                    since_id = tool.find_sub_string(video_page_data, "type=video&owner_uid=&since_id=", '">')
                    if not since_id:
                        break

            # A previous record exists but this run never hit it, so the last recorded video must have been deleted
            if self.account_info[4] != "" and video_count > 1 and not is_over:
                log.error(account_name + " the last video downloaded previously was not found")

            # Images
            image_count = 1
            page_count = 1
            first_image_time = "0"
            unique_list = []
            is_over = False
            need_make_image_dir = True
            while IS_DOWNLOAD_IMAGE and (not is_over):
                # Get the info of one page of images
                photo_page_data = get_one_page_photo_data(account_id, page_count)
                if photo_page_data is None:
                    log.error(account_name + " failed to get the image list")
                    first_image_time = "0"  # restore the archive record
                    break

                log.trace(account_name + " all image info on page %s: %s" % (page_count, photo_page_data))
                for image_info in photo_page_data["photo_list"]:
                    if not robot.check_sub_key(("pic_host", "pic_name", "timestamp"), image_info):
                        log.error(account_name + " failed to parse the info of image %s: %s" % (image_count, image_info))
                        continue

                    # Check whether the image timestamp is at or below the last recorded one
                    if int(image_info["timestamp"]) <= int(self.account_info[2]):
                        is_over = True
                        break

                    # Skip duplicates caused by newly added images
                    if image_info["pic_name"] in unique_list:
                        continue
                    else:
                        unique_list.append(image_info["pic_name"])
                    # Use the upload time of the first image as the new archive record
                    if first_image_time == "0":
                        first_image_time = str(image_info["timestamp"])

                    image_url = str(image_info["pic_host"]) + "/large/" + str(image_info["pic_name"])
                    log.step(account_name + " starting download of image %s: %s" % (image_count, image_url))

                    # Fetch the image bytes and check whether the image is usable
                    image_status, image_byte = get_image_byte(image_url)
                    if image_status != 1:
                        if image_status == -1:
                            log.error(account_name + " image %s (%s) failed to download" % (image_count, image_url))
                        elif image_status == -2:
                            log.error(account_name + " image %s (%s) has been deleted; skipping" % (image_count, image_url))
                        continue

                    # First image: create the directory
                    if need_make_image_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " failed to create image download directory %s" % image_path)
                            tool.process_exit()
                        need_make_image_dir = False

                    file_type = image_url.split(".")[-1]
                    if file_type.find("/") != -1:
                        file_type = "jpg"
                    image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    save_image(image_byte, image_file_path)
                    log.step(account_name + " image %s downloaded successfully" % image_count)
                    image_count += 1

                    # Reached the download count set in the config file; stop
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break

                if not is_over:
                    # Use the total image count and the per-page count to decide whether there is another page
                    if (photo_page_data["total"] / IMAGE_COUNT_PER_PAGE) > (page_count - 1):
                        page_count += 1
                    else:
                        # All images downloaded
                        is_over = True

            log.step(account_name + " finished downloading, %s images and %s videos in total" % (image_count - 1, video_count - 1))

            # Sort
            if IS_SORT:
                if first_image_time != "0":
                    destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                        log.step(account_name + " images moved from the download directory to the save directory")
                    else:
                        log.error(account_name + " failed to create image save directory %s" % destination_path)
                        tool.process_exit()
                if first_video_url != "":
                    destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                    if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4):
                        log.step(account_name + " videos moved from the download directory to the save directory")
                    else:
                        log.error(account_name + " failed to create video save directory %s" % destination_path)
                        tool.process_exit()

            # New archive record
            if first_image_time != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_image_time
            if first_video_url != "":
                self.account_info[3] = str(int(self.account_info[3]) + video_count - 1)
                self.account_info[4] = first_video_url

            # Save the final info
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            TOTAL_VIDEO_COUNT += video_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " done")
        except SystemExit as se:
            if se.code == 0:
                log.step(account_name + " exited early")
            else:
                log.error(account_name + " exited abnormally")
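The video loop pages through the feed with an opaque since_id cursor scraped from each response, stopping when no cursor is found. A minimal sketch of that cursor pattern, with a hypothetical fetch_page(cursor) returning (items, next_cursor):

def fetch_all(fetch_page):
    """Drain a cursor-paginated feed; fetch_page(cursor) -> (items, next_cursor)."""
    items, cursor = [], None
    while True:
        page_items, cursor = fetch_page(cursor)
        items.extend(page_items)
        if not cursor:  # no cursor in the response means this was the last page
            return items

# usage with a canned two-page feed
pages = {None: ([1, 2], "abc"), "abc": ([3], "")}
assert fetch_all(lambda cursor: pages[cursor]) == [1, 2, 3]
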
Example #7
    def run(self):
        global TOTAL_IMAGE_COUNT

        account_id = self.account_info[0]

        try:
            log.step(account_id + " starting")

            # Use a temp directory if re-sorting is needed, otherwise download straight to the target directory
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_id)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)

            # Image download
            page_count = 1
            image_count = 1
            first_post_id = ""
            unique_list = []
            is_over = False
            need_make_download_dir = True
            while not is_over:
                post_url_list = get_one_page_post_url_list(account_id, page_count)
                # Can't fetch the album index page
                if post_url_list is None:
                    log.error(account_id + " can't access page %s of the album" % page_count)
                    tool.process_exit()

                if len(post_url_list) == 0:
                    # Everything has been downloaded
                    break

                # Dedup and sort
                log.trace(account_id + " all post pages found on album page %s: %s" % (page_count, post_url_list))
                post_url_list = sorted(list(set(post_url_list)), reverse=True)
                log.trace(account_id + " post pages on album page %s after dedup and sorting: %s" % (page_count, post_url_list))
                for post_url in post_url_list:
                    post_id = post_url.split("/")[-1].split("_")[-1]

                    # Check whether we have reached the images downloaded last time
                    if post_id <= self.account_info[2]:
                        is_over = True
                        break

                    # Skip duplicates caused by newly added posts
                    if post_id in unique_list:
                        continue
                    else:
                        unique_list.append(post_id)
                    # Use the first post's id as the new archive record
                    if first_post_id == "":
                        first_post_id = post_id

                    post_page_return_code, post_page = tool.http_request(post_url)[:2]
                    if post_page_return_code != 1:
                        log.error(account_id + " image %s: can't fetch post page %s" % (image_count, post_url))
                        continue

                    image_url_list = get_image_url_list(post_page)
                    log.trace(account_id + " all images found in post page %s: %s" % (post_url, image_url_list))
                    if len(image_url_list) == 0:
                        log.error(account_id + " image %s: no image found in post page %s" % (image_count, post_url))
                        continue
                    for image_url in image_url_list:
                        if image_url.rfind("?") > image_url.rfind("."):
                            image_url = image_url.split("?")[0]
                        log.step(account_id + " starting download of image %s: %s" % (image_count, image_url))

                        # First image: create the directory
                        if need_make_download_dir:
                            if not tool.make_dir(image_path, 0):
                                log.error(account_id + " failed to create image download directory %s" % image_path)
                                tool.process_exit()
                            need_make_download_dir = False

                        file_type = image_url.split(".")[-1]
                        file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_id + " image %s downloaded successfully" % image_count)
                            image_count += 1
                        else:
                            log.error(account_id + " image %s (%s) failed to download" % (image_count, image_url))

                        # Reached the download count set in the config file; stop
                        if 0 < GET_IMAGE_COUNT < image_count:
                            is_over = True
                            break

                    if is_over:
                        break

                if not is_over:
                    # Reached the page count set in the config file; stop
                    if 0 < GET_PAGE_COUNT <= page_count:
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_id + " finished downloading, %s images in total" % (image_count - 1))

            # Sort
            if IS_SORT and image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_id + " images moved from the download directory to the save directory")
                else:
                    log.error(account_id + " failed to create image subdirectory %s" % destination_path)
                    tool.process_exit()

            # New archive record
            if first_post_id != "":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_post_id

            # Save the final info
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_id + " done")
        except SystemExit as se:
            if se.code == 0:
                log.step(account_id + " exited early")
            else:
                log.error(account_id + " exited abnormally")
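Several examples derive the file extension with string splits plus rfind checks, and strip ?query parts by hand. A compact sketch of the same idea using only the standard library, with the jpg fallback the examples use when the path has no extension:

import os
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

def guess_file_type(url, default="jpg"):
    # Parse the URL so query strings and fragments never leak into the extension
    path = urlparse(url).path
    ext = os.path.splitext(path)[1]
    return ext[1:] if ext else default

assert guess_file_type("http://example.com/a/001.png?x=1") == "png"
assert guess_file_type("http://example.com/a/noext") == "jpg"
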
Example #8
def get_genomes(genome_id,
                genome_region,
                gene_ids,
                reverse_complement=True,
                entrez_mail='*****@*****.**',
                force=False):
    Entrez.email = entrez_mail
    chromosome, start, end = genome_region
    # NCBI uses 1-based indexing and closed intervals [a, b]
    handle = Entrez.efetch(db='nucleotide',
                           id=genome_id,
                           rettype='fasta',
                           strand=1,
                           seq_start=start + 1,
                           seq_stop=end + 1 + 1)
    record = SeqIO.read(handle, 'fasta')
    hg19 = record.seq

    genomes = {}
    handle = Entrez.read(
        Entrez.esearch(db='nucleotide',
                       term=' '.join(g[1] for g in gene_ids),
                       retmode='xml'))
    for gi, gid in enumerate(handle['IdList']):
        params = {}
        if len(gene_ids[gi]) > 2:
            params = gene_ids[gi][2]
        genome = Entrez.efetch(db='nucleotide',
                               id=gid,
                               rettype='gb',
                               retmode='text',
                               **params).read()
        genome = SeqIO.read(StringIO(genome), 'genbank')

        if reverse_complement:
            genome.seq = genome.seq.reverse_complement()
        alignment = blat(hg19, genome.seq)

        log.trace('NCBI: Gene {} BLAT results: hit {}, query {}', genome.id,
                  alignment.hit_range, alignment.query_range)
        translation = dict(
            (i[0], i[1] + start) for f in alignment
            for i in zip(range(*f.query_range), range(*f.hit_range)))
        cds = [c for c in genome.features if c.type == 'CDS']
        if len(cds) == 0:
            cds = [c for c in genome.features if c.type == 'misc_RNA']
        for cd in cds:
            protein = ''
            if 'translation' in cd.qualifiers:
                protein = cd.qualifiers['translation']

            if reverse_complement:
                exons = [
                    SeqFeature.FeatureLocation(
                        len(genome.seq) - e.end,
                        len(genome.seq) - e.start, 1)
                    for e in cd.location.parts
                ]
                introns = [
                    SeqFeature.FeatureLocation(e2.end, e1.start, 1)
                    for e1, e2 in zip(exons[:-1], exons[1:])
                ]
            else:
                exons = [
                    SeqFeature.FeatureLocation(e.start, e.end, 1)
                    for e in cd.location.parts
                ]
                introns = [
                    SeqFeature.FeatureLocation(e1.end, e2.start, 1)
                    for e1, e2 in zip(exons[:-1], exons[1:])
                ]

            genomes[cd.qualifiers['gene'][0]] = Gene(
                name=cd.qualifiers['gene'][0],
                protein=protein,
                introns=introns,
                exons=exons,
                seq=genome.seq,
                translation=translation,
                pseudo_mutations={},
                pseudo_translation={},
                special_regions={})

    if len(gene_ids) > 1:
        g, p = gene_ids[0][0], gene_ids[1][0]
        p, pt = get_pseudo_mutations(genomes[g], genomes[p], force)
        genomes[g].pseudo_mutations.update(p)
        genomes[g].pseudo_translation.update(pt)

    return genomes, hg19
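Entrez.efetch's seq_start and seq_stop are 1-based and inclusive, so the 0-based region has to be shifted. A tiny worked sketch of the conversion, assuming the region is 0-based and half-open (the example's end + 1 + 1 fetches one extra base beyond this, presumably as padding):

def to_one_based_closed(start, end):
    """Convert a 0-based half-open interval [start, end) to 1-based closed [a, b]."""
    return start + 1, end

# bases at 0-based offsets 10..19 are positions 11..20 in 1-based closed coordinates
assert to_one_based_closed(10, 20) == (11, 20)
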
Example #9
    def run(self):
        global TOTAL_IMAGE_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]
        if len(self.account_info) >= 5 and self.account_info[4]:
            account_file_path = self.account_info[4]
        else:
            account_file_path = ""

        try:
            log.step(account_name + " starting")

            # Use a temp directory if re-sorting is needed, otherwise download straight to the target directory
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_file_path, account_name)

            # Image download
            image_count = 1
            key = ""
            first_album_id = "0"
            unique_list = []
            is_over = False
            need_make_download_dir = True
            while not is_over:
                # Get one page of albums
                album_page = get_one_page_album(account_id, key)
                if album_page is None:
                    log.error(account_name + " can't access the album page, token: %s" % key)
                    tool.process_exit()

                # Get the list of all picasaweb URLs on the album page
                picasaweb_url_list = get_picasaweb_url_list(album_page)

                log.trace(account_name + " all picasaweb pages found in the album: %s" % picasaweb_url_list)
                for picasaweb_url in picasaweb_url_list:
                    # URLs may carry an authkey, which has to be removed, e.g.
                    # https://picasaweb.google.com/116300481938868290370/2015092603?authkey\u003dGv1sRgCOGLq-jctf-7Ww#6198800191175756402
                    picasaweb_url = picasaweb_url.replace("\\u003d", "=")

                    # Get the album id from the picasaweb page
                    album_id = get_picasaweb_page_album_id(account_id, picasaweb_url)
                    if album_id is None:
                        log.error(account_name + " image %s: can't access picasaweb page %s" % (image_count, picasaweb_url))
                        continue
                    if not album_id:
                        log.error(account_name + " image %s: failed to get the album id from picasaweb page %s" % (image_count, picasaweb_url))
                        continue
                    log.trace(account_name + " album id of picasaweb page %s: %s" % (picasaweb_url, album_id))

                    # Check whether we have reached the images downloaded last time
                    if int(album_id) <= int(self.account_info[2]):
                        is_over = True
                        break

                    # Skip duplicate album_ids
                    if album_id in unique_list:
                        continue
                    else:
                        unique_list.append(album_id)
                    # Use the first album_id as the new archive record
                    if first_album_id == "0":
                        first_album_id = album_id

                    # Get the list of all image URLs on the archive page of this album id
                    image_url_list = get_image_url_list(account_id, album_id)
                    if image_url_list is None:
                        log.error(account_name + " image %s: can't access the archive page of album id %s" % (image_count, album_id))
                        continue
                    if len(image_url_list) == 0:
                        log.error(account_name + " image %s: no image parsed from the archive page of album id %s" % (image_count, album_id))
                        continue

                    log.trace(account_name + " all images on the archive page of album id %s: %s" % (album_id, image_url_list))
                    for image_url in list(image_url_list):
                        image_url = generate_max_resolution_image_url(image_url)

                        # Download
                        log.step(account_name + " starting download of image %s: %s" % (image_count, image_url))

                        # First image: create the directory
                        if need_make_download_dir:
                            if not tool.make_dir(image_path, 0):
                                log.error(account_name + " failed to create image download directory %s" % image_path)
                                tool.process_exit()
                            need_make_download_dir = False

                        if image_url.rfind("/") < image_url.rfind("."):
                            file_type = image_url.split(".")[-1]
                        else:
                            file_type = "jpg"
                        file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_name + " image %s downloaded successfully" % image_count)
                            image_count += 1
                        else:
                            log.error(account_name + " image %s (%s) failed to download" % (image_count, image_url))

                        # Reached the download count set in the config file; stop
                        if 0 < GET_IMAGE_COUNT < image_count:
                            is_over = True
                            break
                    if is_over:
                        break

                if not is_over:
                    # Find the token key of the next page
                    key_find = re.findall('"([.]?[a-zA-Z0-9-_]*)"', album_page)
                    if len(key_find) > 0 and len(key_find[0]) > 80:
                        key = key_find[0]
                    else:
                        # Not the first download
                        if self.account_info[2] != "0":
                            log.error(account_name + " no token found for the next page; saving the page:")
                            log.error(album_page)
                        is_over = True

            log.step(account_name + " finished downloading, %s images in total" % (image_count - 1))

            # Sort
            if IS_SORT and image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_file_path, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " images moved from the download directory to the save directory")
                else:
                    log.error(account_name + " failed to create image subdirectory %s" % destination_path)
                    tool.process_exit()

            # New archive record
            if first_album_id != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_album_id

            # Save the final info
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " done")
        except SystemExit as se:
            if se.code == 0:
                log.step(account_name + " exited early")
            else:
                log.error(account_name + " exited abnormally")
Example #10
    def run(self):
        global TOTAL_IMAGE_COUNT

        account_id = self.account_info[0]
        if len(self.account_info) >= 4 and self.account_info[3]:
            account_name = self.account_info[3]
        else:
            account_name = self.account_info[0]

        try:
            log.step(account_name + " starting")

            # Use a temp directory if re-sorting is needed, otherwise download straight to the target directory
            if IS_SORT:
                image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            else:
                image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)

            image_count = 1
            page_count = 1
            first_diary_id = "0"
            is_over = False
            need_make_image_dir = True
            while not is_over:
                # Get one page of diary entries
                diary_list = get_one_page_diary_data(account_id, page_count)
                if diary_list is None:
                    log.error(account_name + " failed to parse the diary list on page %s" % page_count)
                    tool.process_exit()

                # No diary entries returned; everything has been fetched
                if len(diary_list) == 0:
                    break

                for diary_info in list(diary_list):
                    # Diary id
                    diary_id = tool.find_sub_string(diary_info, "id=", "&")
                    if not diary_id:
                        log.error(account_name + " failed to parse the diary id, diary info: %s" % diary_info)
                        continue

                    # Check whether this is the last diary entry from the previous run
                    if int(diary_id) <= int(self.account_info[2]):
                        is_over = True
                        break

                    # Use the first diary entry's id as the new archive record
                    if first_diary_id == "0":
                        first_diary_id = diary_id

                    log.trace(account_name + " diary id %s" % diary_id)

                    # Get the list of all image URLs in this diary entry
                    image_url_list = get_image_url_list(diary_info)
                    for image_url in image_url_list:
                        # If the image URL has no scheme and host, it points at the current domain and must be completed
                        if image_url[:7] != "http://" and image_url[:8] != "https://":
                            if image_url[0] == "/":
                                image_url = "http://www.keyakizaka46.com%s" % image_url
                            else:
                                image_url = "http://www.keyakizaka46.com/%s" % image_url

                        log.step(account_name + " starting download of image %s: %s" % (image_count, image_url))

                        # First image: create the directory
                        if need_make_image_dir:
                            if not tool.make_dir(image_path, 0):
                                log.error(account_name + " failed to create image download directory %s" % image_path)
                                tool.process_exit()
                            need_make_image_dir = False

                        file_type = image_url.split(".")[-1]
                        file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                        if tool.save_net_file(image_url, file_path):
                            log.step(account_name + " image %s downloaded successfully" % image_count)
                            image_count += 1
                        else:
                            log.error(account_name + " image %s (%s) failed to download" % (image_count, image_url))

                    # Reached the download count set in the config file; stop
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break

                if not is_over:
                    # Reached the page count set in the config file; stop
                    if 0 < GET_PAGE_COUNT <= page_count:
                        is_over = True
                    else:
                        page_count += 1

            log.step(account_name + " finished downloading, %s images in total" % (image_count - 1))

            # Sort
            if IS_SORT and image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " images moved from the download directory to the save directory")
                else:
                    log.error(account_name + " failed to create image subdirectory %s" % destination_path)
                    tool.process_exit()

            # New archive record
            if first_diary_id != "0":
                self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
                self.account_info[2] = first_diary_id

            # Save the final info
            tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
            self.thread_lock.acquire()
            TOTAL_IMAGE_COUNT += image_count - 1
            ACCOUNTS.remove(account_id)
            self.thread_lock.release()

            log.step(account_name + " done")
        except SystemExit as se:
            if se.code == 0:
                log.step(account_name + " exited early")
            else:
                log.error(account_name + " exited abnormally")
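The scheme check above rebuilds relative image URLs by hand; the standard library's urljoin handles both the leading-slash and bare-relative cases. A small sketch of the equivalent normalization, using the same www.keyakizaka46.com base as the example:

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin      # Python 2

BASE = "http://www.keyakizaka46.com/"

def absolutize(image_url):
    # Absolute URLs pass through untouched; relative ones resolve against BASE
    return urljoin(BASE, image_url)

assert absolutize("/images/a.jpg") == "http://www.keyakizaka46.com/images/a.jpg"
assert absolutize("http://cdn.example.com/b.jpg") == "http://cdn.example.com/b.jpg"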