def trace(self, message, include_display_name=True):
    """Emit *message* on the trace log.

    When include_display_name is true and self.display_name is set,
    the display name is prepended to the message first.
    """
    prefix = self.display_name if include_display_name else None
    if prefix is not None:
        message = prefix + " " + message
    log.trace(message)
def main(self):
    """Crawl listing pages one by one and spawn a Download thread per cover image.

    Loops over listing pages until a page fetch fails or a page contains no
    items, extracting (thumbnail URL, title) pairs and starting one Download
    thread per image. Threading behaviour depends on thread_type (see below).
    """
    page_count = 1
    image_count = 0  # NOTE(review): never updated or read below — appears unused
    # Thread count of the process before we spawn any workers; used as the
    # baseline when deciding whether our own download threads are still alive.
    main_thread_count = threading.activeCount()
    # Threading mode:
    # 1 - keep at most N download threads running concurrently
    # 2 - start threads for every image on one page, wait for all of them
    #     to finish, then move on to the next page
    thread_type = 2
    while True:
        # Fetch one listing page
        page_data = get_one_page_data(page_count)
        if page_data is None:
            log.error("第%s页获取失败" % page_count)
            break
        # Extract all (thumbnail URL, title) pairs from the page
        image_info_list = re.findall('<img src="" data-original="([^"]*)" class="lazy img" title="([^"]*)">', page_data)
        # Number of items on the page (used as the end-of-listing signal)
        page_data_count = page_data.count('<div class="item pull-left">')
        # No items at all: everything has been downloaded
        if page_data_count == 0:
            break
        log.step("第%s页,影片数量%s,获取到的封面图片数量%s" % (page_count, len(image_info_list), page_data_count))
        for small_image_url, title in image_info_list:
            # Mode 1: thread cap reached — wait for a slot to free up
            while thread_type == 1 and threading.activeCount() >= self.thread_count + main_thread_count:
                time.sleep(5)
            title = robot.filter_text(str(title)).upper()
            image_url = get_large_image_url(small_image_url)
            if image_url is None:
                log.trace("%s的封面图片大图地址获取失败" % title)
                continue
            log.step("开始下载%s的封面图片 %s" % (title, image_url))
            file_type = image_url.split(".")[-1]
            file_path = os.path.join(self.image_download_path, "%s.%s" % (title, file_type))
            file_temp_path = os.path.join(self.image_download_path, "%s_temp.%s" % (title, file_type))
            # Hand the actual download off to a worker thread
            thread = Download(self.thread_lock, title, file_path, file_temp_path, image_url)
            thread.start()
            time.sleep(0.1)
        # Mode 2: block until all worker threads spawned for this page finish
        while thread_type == 2 and threading.activeCount() > main_thread_count:
            time.sleep(5)
        page_count += 1
    log.step("全部下载完毕,耗时%s秒,共计图片%s张" % (self.get_run_time(), TOTAL_IMAGE_COUNT))
def get_dbsnp(data, region, force=False):
    """Annotate mutations in *data* with dbSNP rsIDs queried from myVariant.info.

    Args:
        data: dict of allele -> {'mutations': [mutation dicts]}; each mutation
            has 'pos', 'op', 'dbsnp' (and 'old') keys. Mutated in place.
        region: (chromosome, start, end) triple used to build the hg19 query.
        force: unused here; kept for interface compatibility with callers.

    Returns:
        The same *data* object, with each mutation's 'dbsnp' field rewritten
        as a list of rsIDs (the original Karolinska prediction, if any, is
        kept first and tagged with '(k)' when a different rsID is found).
    """
    mv = myvariant.MyVariantInfo()
    q = mv.query(
        '_exists_:dbsnp AND _exists_:hg19 AND {}:{}-{}'.format(*region),
        fields='dbsnp', fetch_all=True)
    snps = list(q)
    # VCF, dbSNP and myVariant use 1-based indexing; our positions are 0-based,
    # hence the `- 1` below. dbsnp maps position -> {op string -> rsID}.
    dbsnp = collections.defaultdict(dict)
    for snp in snps:
        pos, ref, alt, rs = (snp['dbsnp']['hg19']['start'] - 1, snp['dbsnp']['ref'],
                             snp['dbsnp']['alt'], snp['dbsnp']['rsid'])
        if len(ref) > 1 or len(alt) > 1:
            # Indels share their first (anchor) base between ref and alt
            assert (ref[0] == alt[0])
        if len(ref) > 1:
            op = 'DEL.{}'.format(ref[1:])
        elif len(alt) > 1:
            op = 'INS.{}'.format(alt[1:].lower())
        else:
            op = 'SNP.{}{}'.format(ref, alt)
        dbsnp[pos][op] = rs

    def _assign(m, pos, op):
        # Record the rsID found at (pos, op) on mutation m, preserving and
        # tagging any pre-existing Karolinska prediction with '(k)'.
        rsid = str(dbsnp[pos][op])
        if rsid not in m['dbsnp']:
            if len(m['dbsnp']) > 0:
                m['dbsnp'][0] += '(k)'
            m['dbsnp'].append(rsid)
            log.debug('dbSNP: Variant {} assigned to {}:{}', rsid, pos, op)
        else:
            log.debug(
                'dbSNP: Variant {} matches the Karolinska\'s prediction', rsid)

    for a in sorted(data):
        for m in data[a]['mutations']:
            if m['pos'] == 'pseudogene':
                continue
            # Normalize the dbsnp field to a list ('' and '*' mean "none")
            if m['dbsnp'] not in ['', '*']:
                m['dbsnp'] = [m['dbsnp']]
            else:
                m['dbsnp'] = []
            pos, op = m['pos'], m['op']
            if op in dbsnp[pos]:
                _assign(m, pos, op)
            elif len(dbsnp[pos]) > 0 and (op[:3] == 'SNP'
                                          and op[:4] + op[4:6][::-1] in dbsnp[pos]):
                # Try the SNP with its two alleles swapped (reversed strand)
                _assign(m, pos, op[:4] + op[4:6][::-1])
            elif len(dbsnp[pos]) != 0:
                log.trace('How about {} for {}:{} ({})', dbsnp[pos], pos, op, m['old'])
    return data
def run(self):
    """Download every new work ("正片") of one coser account, page by page.

    Reads self.account_info = [coser_id, last_seen_rp_id, (optional) cn name],
    downloads all works newer than last_seen_rp_id into per-work directories,
    then updates the archive record and the global image counter.
    Python 2 code (iteritems / except-comma syntax).
    """
    global TOTAL_IMAGE_COUNT
    coser_id = self.account_info[0]
    # Optional third field is the human-readable CN (cosplayer name)
    if len(self.account_info) >= 3:
        cn = self.account_info[2]
    else:
        cn = self.account_info[0]
    try:
        log.step(cn + " 开始")
        image_path = os.path.join(IMAGE_DOWNLOAD_PATH, cn)
        # Image download state
        this_cn_total_image_count = 0
        page_count = 1
        total_rp_count = 1
        first_rp_id = ""          # newest work id, becomes the new archive mark
        unique_list = []          # rp ids already handled (new posts can shift pages)
        is_over = False
        need_make_download_dir = True  # whether the CN directory still needs creating
        while not is_over:
            # Fetch one page of works
            post_page = get_one_page_post(coser_id, page_count)
            if post_page is None:
                log.error(cn + " 无法访问第%s页作品" % page_count)
                tool.process_exit()
            # Parse the page: cp_id plus {rp_id: title} for every work on it
            cp_id, rp_list = get_rp_list(post_page)
            if cp_id is None:
                log.error(cn + " 第%s页作品解析异常" % page_count)
                tool.process_exit()
            for rp_id, title in rp_list.iteritems():
                # Reached a work already downloaded in a previous run — stop
                if int(rp_id) <= int(self.account_info[1]):
                    is_over = True
                    break
                # Skip duplicates caused by new posts shifting page boundaries
                if rp_id in unique_list:
                    continue
                else:
                    unique_list.append(rp_id)
                # Remember the first (newest) work id as the new archive record
                if first_rp_id == "":
                    first_rp_id = rp_id
                log.trace("rp: " + rp_id)
                # Lazily create the CN directory on first actual download
                if need_make_download_dir:
                    if not tool.make_dir(image_path, 0):
                        log.error(cn + " 创建CN目录 %s 失败" % image_path)
                        tool.process_exit()
                    need_make_download_dir = False
                # Strip characters the filesystem does not support from the title
                title = robot.filter_text(title)
                if title:
                    rp_path = os.path.join(image_path, "%s %s" % (rp_id, title))
                else:
                    rp_path = os.path.join(image_path, rp_id)
                if not tool.make_dir(rp_path, 0):
                    # Directory creation failed — retry without the title, then give up
                    log.error(cn + " 创建作品目录 %s 失败,尝试不使用title" % rp_path)
                    rp_path = os.path.join(image_path, rp_id)
                    if not tool.make_dir(rp_path, 0):
                        log.error(cn + " 创建作品目录 %s 失败" % rp_path)
                        tool.process_exit()
                # All image URLs inside this work's page
                image_url_list = get_image_url_list(cp_id, rp_id)
                if image_url_list is None:
                    log.error(cn + " 无法访问正片:%s,cp_id:%s" % (rp_id, cp_id))
                    continue
                if len(image_url_list) == 0 and IS_AUTO_FOLLOW:
                    # Empty list may mean a followers-only work; auto-follow and retry
                    log.step(cn + " 检测到可能有私密作品且账号不是ta的粉丝,自动关注")
                    if follow(coser_id):
                        image_url_list = get_image_url_list(cp_id, rp_id)
                if len(image_url_list) == 0:
                    log.error(cn + " 正片:%s没有任何图片,可能是你使用的账号没有关注ta,所以无法访问只对粉丝开放的私密作品,cp_id:%s" % (rp_id, cp_id))
                    continue
                image_count = 1
                for image_url in list(image_url_list):
                    # Drop the trailing resolution segment to get the original image
                    image_url = "/".join(image_url.split("/")[0:-1])
                    log.step(cn + " %s 开始下载第%s张图片 %s" % (rp_id, image_count, image_url))
                    # Only trust the extension when the URL actually ends in one
                    if image_url.rfind("/") < image_url.rfind("."):
                        file_type = image_url.split(".")[-1]
                    else:
                        file_type = "jpg"
                    file_path = os.path.join(rp_path, "%03d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        # NOTE(review): counter is bumped before the success log,
                        # so the logged index is one higher than the file saved
                        image_count += 1
                        log.step(cn + " %s 第%s张图片下载成功" % (rp_id, image_count))
                    else:
                        log.error(cn + " %s 第%s张图片 %s 下载失败" % (rp_id, image_count, image_url))
                this_cn_total_image_count += image_count - 1
                # Stop once the configured number of works has been processed
                if 0 < GET_PAGE_COUNT < total_rp_count:
                    is_over = True
                    break
                else:
                    total_rp_count += 1
            if not is_over:
                # Advance to the next page unless this was the last one
                if page_count >= get_max_page_count(coser_id, post_page):
                    is_over = True
                else:
                    page_count += 1
        log.step(cn + " 下载完毕,总共获得%s张图片" % this_cn_total_image_count)
        # New archive record: newest work id seen this run
        if first_rp_id != "":
            self.account_info[1] = first_rp_id
        # Persist the final per-account state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += this_cn_total_image_count
        ACCOUNTS.remove(coser_id)
        self.thread_lock.release()
        log.step(cn + " 完成")
    except SystemExit:
        log.error(cn + " 异常退出")
    except Exception, e:
        log.error(cn + " 未知异常")
        log.error(str(e) + "\n" + str(traceback.format_exc()))
def run(self):
    """Download all new images and videos from one tumblr account.

    Reads self.account_info = [account_id, image_count, video_count,
    last_post_id]; walks the archive pages newest-first, downloads media from
    each post newer than last_post_id, optionally re-sorts files into the
    final directories, then updates the archive record and global counters.
    """
    global TOTAL_IMAGE_COUNT
    global TOTAL_VIDEO_COUNT
    account_id = self.account_info[0]
    try:
        log.step(account_id + " 开始")
        # When re-sorting is enabled download into temp dirs first,
        # otherwise write straight into the final download dirs
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_id)
            video_path = os.path.join(VIDEO_TEMP_PATH, account_id)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
            video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_id)
        page_count = 1
        image_count = 1
        video_count = 1
        first_post_id = ""   # newest post id, becomes the new archive mark
        unique_list = []     # post ids already handled
        is_over = False
        need_make_image_dir = True
        need_make_video_dir = True
        while not is_over:
            post_url_list = get_one_page_post_url_list(account_id, page_count)
            if post_url_list is None:
                log.error(account_id + " 无法访问第%s页相册页" % page_count)
                tool.process_exit()
            if len(post_url_list) == 0:
                # Empty page: nothing left to download
                break
            log.trace(account_id + " 相册第%s页获取的所有信息页:%s" % (page_count, post_url_list))
            post_url_list_group_by_post_id = filter_post_url(post_url_list)
            log.trace(account_id + " 相册第%s页去重排序后的信息页:%s" % (page_count, post_url_list_group_by_post_id))
            log.step(account_id + " 相册第%s页获取到%s页信息页" % (page_count, len(post_url_list_group_by_post_id)))
            # Walk posts newest-first so we can stop at the archive mark
            for post_id in sorted(post_url_list_group_by_post_id.keys(), reverse=True):
                # Reached a post already downloaded in a previous run — stop
                if post_id <= self.account_info[3]:
                    is_over = True
                    break
                # Remember the first (newest) post id as the new archive record
                if first_post_id == "":
                    first_post_id = post_id
                # Fetch the post page and keep only its <head> content
                post_url = "http://%s.tumblr.com/post/%s" % (account_id, post_id)
                post_page_head = get_post_page_head(post_url, post_url_list_group_by_post_id[post_id])
                if post_page_head is None:
                    log.error(account_id + " 无法访问信息页 %s" % post_url)
                    continue
                if not post_page_head:
                    log.error(account_id + " 信息页 %s 截取head标签异常" % post_url)
                    continue
                # og:type tells us whether the post is video, image or other
                og_type = tool.find_sub_string(post_page_head, '<meta property="og:type" content="', '" />')
                if not og_type:
                    log.error(account_id + " 信息页 %s,'og:type'获取异常" % post_url)
                    continue
                # Skip empty, audio, quote and link posts
                if og_type in ["tumblr-feed:entry", "tumblr-feed:audio", "tumblr-feed:quote", "tumblr-feed:link"]:
                    continue
                # Skip duplicates caused by new posts shifting page boundaries
                if post_id in unique_list:
                    continue
                else:
                    unique_list.append(post_id)
                # --- video download ---
                if IS_DOWNLOAD_VIDEO and og_type == "tumblr-feed:video":
                    video_list = get_video_list(account_id, post_id)
                    if video_list is None:
                        log.error(account_id + " 第%s个视频 日志id:%s无法访问播放页" % (video_count, post_id))
                    else:
                        if len(video_list) > 0:
                            for video_url, video_type in list(video_list):
                                log.step(account_id + " 开始下载第%s个视频 %s" % (video_count, video_url))
                                # Create the video directory lazily on first download
                                if need_make_video_dir:
                                    if not tool.make_dir(video_path, 0):
                                        log.error(account_id + " 创建视频下载目录 %s 失败" % video_path)
                                        tool.process_exit()
                                    need_make_video_dir = False
                                # File extension comes from the MIME type (e.g. video/mp4)
                                file_type = video_type.split("/")[-1]
                                video_file_path = os.path.join(video_path, "%04d.%s" % (video_count, file_type))
                                if tool.save_net_file(video_url, video_file_path):
                                    log.step(account_id + " 第%s个视频下载成功" % video_count)
                                    video_count += 1
                                else:
                                    log.error(account_id + " 第%s个视频 %s 下载失败" % (video_count, video_url))
                        else:
                            log.error(account_id + " 第%s个视频 日志id:%s 中没有找到视频" % (video_count, post_id))
                # --- image download ---
                if IS_DOWNLOAD_IMAGE:
                    if og_type == "tumblr-feed:video":
                        # Video posts: only grab the og:image preview if present
                        page_image_url_list = []
                        video_image_url = tool.find_sub_string(post_page_head, '<meta property="og:image" content="', '" />')
                        if video_image_url:
                            page_image_url_list.append(video_image_url)
                    else:
                        page_image_url_list = re.findall('"(http[s]?://\w*[.]?media.tumblr.com/[^"]*)"', post_page_head)
                    log.trace(account_id + " 信息页 %s 过滤前的所有图片:%s" % (post_url, page_image_url_list))
                    # Drop avatars and duplicate resolutions of the same image
                    page_image_url_list = filter_different_resolution_images(page_image_url_list)
                    log.trace(account_id + " 信息页 %s 获取的的所有图片:%s" % (post_url, page_image_url_list))
                    if len(page_image_url_list) > 0:
                        for image_url in page_image_url_list:
                            log.step(account_id + " 开始下载第%s张图片 %s" % (image_count, image_url))
                            # Create the image directory lazily on first download
                            if need_make_image_dir:
                                if not tool.make_dir(image_path, 0):
                                    log.error(account_id + " 创建图片下载目录 %s 失败" % image_path)
                                    tool.process_exit()
                                need_make_image_dir = False
                            file_type = image_url.split(".")[-1]
                            image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                            if tool.save_net_file(image_url, image_file_path):
                                log.step(account_id + " 第%s张图片下载成功" % image_count)
                                image_count += 1
                            else:
                                log.error(account_id + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    else:
                        log.error(account_id + " 第%s张图片 信息页 %s 中没有找到图片" % (image_count, post_url))
            if not is_over:
                # Stop when the configured page limit is reached
                if 0 < GET_PAGE_COUNT <= page_count:
                    is_over = True
                else:
                    page_count += 1
        log.step(account_id + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))
        # Re-sort: move files from the temp dirs into the numbered final dirs
        if IS_SORT:
            if image_count > 1:
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_id + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_id + " 创建图片保存目录 %s 失败" % destination_path)
                    tool.process_exit()
            if video_count > 1:
                destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_id)
                if robot.sort_file(video_path, destination_path, int(self.account_info[2]), 4):
                    log.step(account_id + " 视频从下载目录移动到保存目录成功")
                else:
                    log.error(account_id + " 创建视频保存目录 %s 失败" % destination_path)
                    tool.process_exit()
        # New archive record: bump cumulative counts, store newest post id
        if first_post_id != "":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = str(int(self.account_info[2]) + video_count - 1)
            self.account_info[3] = first_post_id
        # Persist the final per-account state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        TOTAL_VIDEO_COUNT += video_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_id + " 完成")
    except SystemExit, se:
        # Exit code 0 means a deliberate early stop; anything else is abnormal
        if se.code == 0:
            log.step(account_id + " 提前退出")
        else:
            log.error(account_id + " 异常退出")
def run(self):
    """Download all new videos then all new images from one weibo account.

    Reads self.account_info = [account_id, image_count, last_image_time,
    video_count, last_video_url, (optional) account_name]; crawls videos via
    since_id paging and photos via page numbers, stopping at the previous
    archive marks, then updates the archive record and global counters.
    """
    global TOTAL_IMAGE_COUNT
    global TOTAL_VIDEO_COUNT
    account_id = self.account_info[0]
    # Optional sixth field is a human-readable account name
    if len(self.account_info) >= 6 and self.account_info[5]:
        account_name = self.account_info[5]
    else:
        account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # When re-sorting is enabled download into temp dirs first,
        # otherwise write straight into the final download dirs
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
            video_path = os.path.join(VIDEO_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
            video_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
        # --- video phase ---
        video_count = 1
        account_page_id = None
        first_video_url = ""   # newest video URL, becomes the new archive mark
        is_over = False
        need_make_video_dir = True
        since_id = INIT_SINCE_ID
        while IS_DOWNLOAD_VIDEO and (not is_over):
            # Resolve the account's page_id once
            if account_page_id is None:
                account_page_id = get_account_page_id(account_id)
                if account_page_id is None:
                    log.error(account_name + " 微博主页没有获取到page_id")
                    break
            # Fetch one page of video info older than since_id
            video_page_data = get_one_page_video_data(account_page_id, since_id)
            if video_page_data is None:
                log.error(account_name + " 视频列表解析异常")
                first_video_url = ""  # clear so the archive mark is not advanced
                break
            # All video play-page URLs on this page
            video_play_url_list = get_video_play_url_list(video_page_data)
            log.trace(account_name + "since_id:%s中的全部视频:%s" % (since_id, video_play_url_list))
            for video_play_url in video_play_url_list:
                # Reached the last video of the previous run — stop
                if self.account_info[4] == video_play_url:
                    is_over = True
                    break
                # Remember the first (newest) video URL as the new archive record
                if first_video_url == "":
                    first_video_url = video_play_url
                # Resolve the actual download URL(s) for this video
                return_code, video_url_list = get_video_url(video_play_url)
                if return_code != 1:
                    if return_code == -1:
                        log.error(account_name + " 第%s个视频 %s 没有获取到源地址" % (video_count, video_play_url))
                    elif return_code == -2:
                        log.error(account_name + " 第%s个视频 %s 无法访问" % (video_count, video_play_url))
                    elif return_code == -3:
                        log.error(account_name + " 第%s个视频 %s 暂不支持的视频源" % (video_count, video_play_url))
                    continue
                log.step(account_name + " 开始下载第%s个视频 %s" % (video_count, video_play_url))
                # Create the video directory lazily on first download
                if need_make_video_dir:
                    if not tool.make_dir(video_path, 0):
                        log.error(account_name + " 创建图片下载目录 %s 失败" % video_path)
                        tool.process_exit()
                    need_make_video_dir = False
                video_file_path = os.path.join(video_path, "%04d.mp4" % video_count)
                # NOTE(review): every candidate URL writes to the same path, and
                # video_count is bumped per successful URL, not per video — confirm intended
                for video_url in video_url_list:
                    if tool.save_net_file(video_url, video_file_path):
                        log.step(account_name + " 第%s个视频下载成功" % video_count)
                        video_count += 1
                    else:
                        log.error(account_name + " 第%s个视频 %s 下载失败" % (video_count, video_url))
                # Stop once the configured number of videos has been downloaded
                if 0 < GET_VIDEO_COUNT < video_count:
                    is_over = True
                    break
            if not is_over:
                # Pull the next page's since_id out of the raw page data
                since_id = tool.find_sub_string(video_page_data, "type=video&owner_uid=&since_id=", '">')
                if not since_id:
                    break
        # Had an archive mark, downloaded videos, but never saw the mark:
        # the previously-last video must have been deleted
        if self.account_info[4] != "" and video_count > 1 and not is_over:
            log.error(account_name + " 没有找到上次下载的最后一个视频地址")
        # --- image phase ---
        image_count = 1
        page_count = 1
        first_image_time = "0"  # newest photo timestamp, becomes the new archive mark
        unique_list = []        # photo names already handled
        is_over = False
        need_make_image_dir = True
        while IS_DOWNLOAD_IMAGE and (not is_over):
            # Fetch one page of photo metadata
            photo_page_data = get_one_page_photo_data(account_id, page_count)
            if photo_page_data is None:
                log.error(account_name + " 图片列表获取失败")
                first_image_time = "0"  # clear so the archive mark is not advanced
                break
            log.trace(account_name + "第%s页的全部图片信息:%s" % (page_count, photo_page_data))
            for image_info in photo_page_data["photo_list"]:
                if not robot.check_sub_key(("pic_host", "pic_name", "timestamp"), image_info):
                    log.error(account_name + " 第%s张图片信息解析失败 %s" % (image_count, image_info))
                    continue
                # Photo is at or before the previous run's timestamp — stop
                if int(image_info["timestamp"]) <= int(self.account_info[2]):
                    is_over = True
                    break
                # Skip duplicates caused by new photos shifting page boundaries
                if image_info["pic_name"] in unique_list:
                    continue
                else:
                    unique_list.append(image_info["pic_name"])
                # Remember the first (newest) timestamp as the new archive record
                if first_image_time == "0":
                    first_image_time = str(image_info["timestamp"])
                image_url = str(image_info["pic_host"]) + "/large/" + str(image_info["pic_name"])
                log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                # Fetch the image bytes and verify the resource is usable
                image_status, image_byte = get_image_byte(image_url)
                if image_status != 1:
                    if image_status == -1:
                        log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    elif image_status == -2:
                        log.error(account_name + " 第%s张图片 %s 资源已被删除,跳过" % (image_count, image_url))
                    continue
                # Create the image directory lazily on first download
                if need_make_image_dir:
                    if not tool.make_dir(image_path, 0):
                        log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                        tool.process_exit()
                    need_make_image_dir = False
                file_type = image_url.split(".")[-1]
                # No dot after the last slash means no real extension — default to jpg
                if file_type.find("/") != -1:
                    file_type = "jpg"
                image_file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                save_image(image_byte, image_file_path)
                log.step(account_name + " 第%s张图片下载成功" % image_count)
                image_count += 1
                # Stop once the configured number of images has been downloaded
                if 0 < GET_IMAGE_COUNT < image_count:
                    is_over = True
                    break
            if not is_over:
                # Use total photo count and page size to decide if a next page exists
                if (photo_page_data["total"] / IMAGE_COUNT_PER_PAGE) > (page_count - 1):
                    page_count += 1
                else:
                    # All photos fetched
                    is_over = True
        log.step(account_name + " 下载完毕,总共获得%s张图片和%s个视频" % (image_count - 1, video_count - 1))
        # Re-sort: move files from the temp dirs into the numbered final dirs
        if IS_SORT:
            if first_image_time != "0":
                destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
                if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                    log.step(account_name + " 图片从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建图片保存目录 %s 失败" % destination_path)
                    tool.process_exit()
            if first_video_url != "":
                destination_path = os.path.join(VIDEO_DOWNLOAD_PATH, account_name)
                if robot.sort_file(video_path, destination_path, int(self.account_info[3]), 4):
                    log.step(account_name + " 视频从下载目录移动到保存目录成功")
                else:
                    log.error(account_name + " 创建视频保存目录 %s 失败" % destination_path)
                    tool.process_exit()
        # New archive record: bump cumulative counts, store newest marks
        if first_image_time != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_image_time
        if first_video_url != "":
            self.account_info[3] = str(int(self.account_info[3]) + video_count - 1)
            self.account_info[4] = first_video_url
        # Persist the final per-account state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        TOTAL_VIDEO_COUNT += video_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # Exit code 0 means a deliberate early stop; anything else is abnormal
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def run(self):
    """Download all new images from one account's album posts.

    Reads self.account_info = [account_id, image_count, last_post_id]; walks
    album pages, downloads images from every post newer than last_post_id,
    optionally re-sorts files into the final directory, then updates the
    archive record and the global image counter.
    """
    global TOTAL_IMAGE_COUNT
    account_id = self.account_info[0]
    try:
        log.step(account_id + " 开始")
        # When re-sorting is enabled download into a temp dir first,
        # otherwise write straight into the final download dir
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_id)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
        # Image download state
        page_count = 1
        image_count = 1
        first_post_id = ""   # newest post id, becomes the new archive mark
        unique_list = []     # post ids already handled
        is_over = False
        need_make_download_dir = True
        while not is_over:
            post_url_list = get_one_page_post_url_list(account_id, page_count)
            # Album page unreachable
            if post_url_list is None:
                log.error(account_id + " 无法访问第%s页相册页" % page_count)
                tool.process_exit()
            if len(post_url_list) == 0:
                # Empty page: nothing left to download
                break
            # Dedupe and sort newest-first
            log.trace(account_id + " 相册第%s页获取的所有信息页:%s" % (page_count, post_url_list))
            post_url_list = sorted(list(set(post_url_list)), reverse=True)
            log.trace(account_id + " 相册第%s页去重排序后的信息页:%s" % (page_count, post_url_list))
            for post_url in post_url_list:
                # Post id is the last underscore-separated token of the URL tail
                post_id = post_url.split("/")[-1].split("_")[-1]
                # Reached a post already downloaded in a previous run — stop
                if post_id <= self.account_info[2]:
                    is_over = True
                    break
                # Skip duplicates caused by new posts shifting page boundaries
                if post_id in unique_list:
                    continue
                else:
                    unique_list.append(post_id)
                # Remember the first (newest) post id as the new archive record
                if first_post_id == "":
                    first_post_id = post_id
                post_page_return_code, post_page = tool.http_request(post_url)[:2]
                if post_page_return_code != 1:
                    log.error(account_id + " 第%s张图片,无法获取信息页 %s" % (image_count, post_url))
                    continue
                image_url_list = get_image_url_list(post_page)
                log.trace(account_id + " 信息页 %s 获取的所有图片:%s" % (post_url, image_url_list))
                if len(image_url_list) == 0:
                    log.error(account_id + " 第%s张图片,信息页 %s 中没有找到图片" % (image_count, post_url))
                    continue
                for image_url in image_url_list:
                    # Strip a trailing query string if one follows the extension
                    if image_url.rfind("?") > image_url.rfind("."):
                        image_url = image_url.split("?")[0]
                    log.step(account_id + " 开始下载第%s张图片 %s" % (image_count, image_url))
                    # Create the image directory lazily on first download
                    if need_make_download_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_id + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_download_dir = False
                    file_type = image_url.split(".")[-1]
                    file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_id + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_id + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    # Stop once the configured number of images has been downloaded
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break
                if is_over:
                    break
            if not is_over:
                # Stop when the configured page limit is reached
                if 0 < GET_PAGE_COUNT <= page_count:
                    is_over = True
                else:
                    page_count += 1
        log.step(account_id + " 下载完毕,总共获得%s张图片" % (image_count - 1))
        # Re-sort: move files from the temp dir into the numbered final dir
        if IS_SORT and image_count > 1:
            destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_id)
            if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                log.step(account_id + " 图片从下载目录移动到保存目录成功")
            else:
                log.error(account_id + " 创建图片子目录 %s 失败" % destination_path)
                tool.process_exit()
        # New archive record: bump cumulative count, store newest post id
        if first_post_id != "":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_post_id
        # Persist the final per-account state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_id + " 完成")
    except SystemExit, se:
        # Exit code 0 means a deliberate early stop; anything else is abnormal
        if se.code == 0:
            log.step(account_id + " 提前退出")
        else:
            log.error(account_id + " 异常退出")
def get_genomes(genome_id, genome_region, gene_ids, reverse_complement=True, entrez_mail='*****@*****.**', force=False):
    """Fetch the reference region and gene records from NCBI and build Gene objects.

    Args:
        genome_id: NCBI nucleotide id of the reference sequence to fetch.
        genome_region: (chromosome, start, end) triple; start/end look 0-based
            here since they are shifted by +1 for the 1-based NCBI API.
        gene_ids: sequence of tuples per gene; [0] is the gene name, [1] the
            NCBI search term, optional [2] extra efetch params (dict).
        reverse_complement: if true, work on the reverse complement of each
            fetched gene sequence (and flip exon coordinates accordingly).
        entrez_mail: e-mail registered with Entrez (required by NCBI).
        force: passed through to get_pseudo_mutations.

    Returns:
        (genomes, hg19): dict of gene name -> Gene, and the fetched
        reference Seq for the region.
    """
    Entrez.email = entrez_mail
    chromosome, start, end = genome_region
    # NCBI uses 1 based indexing and closed intervals [a,b]
    # NOTE(review): seq_stop is end + 1 + 1 — looks like one extra base is
    # deliberately fetched past the closed interval; confirm intended
    handle = Entrez.efetch(db='nucleotide', id=genome_id, rettype='fasta', strand=1, seq_start=start + 1, seq_stop=end + 1 + 1)
    record = SeqIO.read(handle, 'fasta')
    hg19 = record.seq
    genomes = {}
    # One esearch over all gene terms; results come back in the same order,
    # so handle['IdList'][i] is assumed to correspond to gene_ids[i]
    handle = Entrez.read(
        Entrez.esearch(db='nucleotide', term=' '.join(g[1] for g in gene_ids), retmode='xml'))
    for gi, gid in enumerate(handle['IdList']):
        params = {}
        if len(gene_ids[gi]) > 2:
            # Optional per-gene efetch overrides
            params = gene_ids[gi][2]
        genome = Entrez.efetch(db='nucleotide', id=gid, rettype='gb', retmode='text', **params).read()
        genome = SeqIO.read(StringIO(genome), 'genbank')
        if reverse_complement:
            genome.seq = genome.seq.reverse_complement()
        # Align the gene sequence against the fetched reference region
        alignment = blat(hg19, genome.seq)
        log.trace('NCBI: Gene {} BLAT results: hit {}, query {}', genome.id, alignment.hit_range, alignment.query_range)
        # Map each aligned gene (query) position to its absolute reference
        # coordinate (hit position offset by the region start)
        translation = dict(
            (i[0], i[1] + start) for f in alignment for i in zip(range(*f.query_range), range(*f.hit_range)))
        # Prefer CDS features; fall back to misc_RNA when no CDS is annotated
        cds = [c for c in genome.features if c.type == 'CDS']
        if len(cds) == 0:
            cds = [c for c in genome.features if c.type == 'misc_RNA']
        for cd in cds:
            protein = ''
            if 'translation' in cd.qualifiers:
                protein = cd.qualifiers['translation']
            if reverse_complement:
                # Exon coordinates must be mirrored onto the reverse-complemented
                # sequence; introns are the gaps between consecutive exons
                exons = [
                    SeqFeature.FeatureLocation(
                        len(genome.seq) - e.end, len(genome.seq) - e.start, 1)
                    for e in cd.location.parts
                ]
                introns = [
                    SeqFeature.FeatureLocation(e2.end, e1.start, 1)
                    for e1, e2 in zip(exons[:-1], exons[1:])
                ]
            else:
                exons = [
                    SeqFeature.FeatureLocation(e.start, e.end, 1)
                    for e in cd.location.parts
                ]
                introns = [
                    SeqFeature.FeatureLocation(e1.end, e2.start, 1)
                    for e1, e2 in zip(exons[:-1], exons[1:])
                ]
            genomes[cd.qualifiers['gene'][0]] = Gene(
                name=cd.qualifiers['gene'][0],
                protein=protein,
                introns=introns,
                exons=exons,
                seq=genome.seq,
                translation=translation,
                pseudo_mutations={},
                pseudo_translation={},
                special_regions={})
    if len(gene_ids) > 1:
        # First entry is the gene, second its pseudogene: attach the
        # pseudogene-derived mutations and coordinate translation to the gene
        g, p = gene_ids[0][0], gene_ids[1][0]
        p, pt = get_pseudo_mutations(genomes[g], genomes[p], force)
        genomes[g].pseudo_mutations.update(p)
        genomes[g].pseudo_translation.update(pt)
    return genomes, hg19
def run(self):
    """Download all new album images from one Google+/Picasaweb account.

    Reads self.account_info = [account_id, image_count, last_album_id,
    (optional) account_name, (optional) sub-directory]; pages through the
    album feed via a continuation token, downloads every album newer than
    last_album_id, optionally re-sorts files, then updates the archive
    record and the global image counter.
    """
    global TOTAL_IMAGE_COUNT
    account_id = self.account_info[0]
    # Optional fourth field is a human-readable account name
    if len(self.account_info) >= 4 and self.account_info[3]:
        account_name = self.account_info[3]
    else:
        account_name = self.account_info[0]
    # Optional fifth field is an extra sub-directory under the download root
    if len(self.account_info) >= 5 and self.account_info[4]:
        account_file_path = self.account_info[4]
    else:
        account_file_path = ""
    try:
        log.step(account_name + " 开始")
        # When re-sorting is enabled download into a temp dir first,
        # otherwise write straight into the final download dir
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_file_path, account_name)
        # Image download state
        image_count = 1
        key = ""               # continuation token for the next album page
        first_album_id = "0"   # newest album id, becomes the new archive mark
        unique_list = []       # album ids already handled
        is_over = False
        need_make_download_dir = True
        while not is_over:
            # Fetch one album page (key is the continuation token)
            album_page = get_one_page_album(account_id, key)
            if album_page is None:
                log.error(account_name + " 无法访问相册页,token:%s" % key)
                tool.process_exit()
            # All picasaweb URLs on the album page
            picasaweb_url_list = get_picasaweb_url_list(album_page)
            log.trace(account_name + " 相册获取的所有picasaweb页:%s" % picasaweb_url_list)
            for picasaweb_url in picasaweb_url_list:
                # URLs may carry an escaped authkey parameter; unescape the
                # \u003d ('=') so the URL parses, e.g.
                # https://picasaweb.google.com/116300481938868290370/2015092603?authkey\u003dGv1sRgCOGLq-jctf-7Ww#6198800191175756402
                picasaweb_url = picasaweb_url.replace("\u003d", "=")
                # Resolve the album id behind this picasaweb page
                album_id = get_picasaweb_page_album_id(account_id, picasaweb_url)
                if album_id is None:
                    log.error(account_name + " 第%s张图片,无法访问picasaweb页 %s" % (image_count, picasaweb_url))
                    continue
                if not album_id:
                    log.error(account_name + " 第%s张图片,picasaweb页 %s 获取album id失败" % (image_count, picasaweb_url))
                    continue
                log.trace(account_name + " picasaweb页 %s 的album id:%s" % (picasaweb_url, album_id))
                # Reached an album already downloaded in a previous run — stop
                if int(album_id) <= int(self.account_info[2]):
                    is_over = True
                    break
                # Skip albums already handled this run
                if album_id in unique_list:
                    continue
                else:
                    unique_list.append(album_id)
                # Remember the first (newest) album id as the new archive record
                if first_album_id == "0":
                    first_album_id = album_id
                # All image URLs from the album's archive page
                image_url_list = get_image_url_list(account_id, album_id)
                if image_url_list is None:
                    log.error(account_name + " 第%s张图片,无法访问album id:%s 的相册存档页" % (image_count, album_id))
                    continue
                if len(image_url_list) == 0:
                    log.error(account_name + " 第%s张图片,album id:%s 的相册存档页没有解析到图片" % (image_count, album_id))
                    continue
                log.trace(account_name + " album id:%s 的相册存档页获取的所有图片:%s" % (album_id, image_url_list))
                for image_url in list(image_url_list):
                    # Rewrite the URL to request the maximum resolution
                    image_url = generate_max_resolution_image_url(image_url)
                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                    # Create the image directory lazily on first download
                    if need_make_download_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_download_dir = False
                    # Only trust the extension when the URL actually ends in one
                    if image_url.rfind("/") < image_url.rfind("."):
                        file_type = image_url.split(".")[-1]
                    else:
                        file_type = "jpg"
                    file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_name + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_name + " 第%s张图片 %s 下载失败" % (image_count, image_url))
                    # Stop once the configured number of images has been downloaded
                    if 0 < GET_IMAGE_COUNT < image_count:
                        is_over = True
                        break
                if is_over:
                    break
            if not is_over:
                # Look for the next page's continuation token (a long opaque string)
                key_find = re.findall('"([.]?[a-zA-Z0-9-_]*)"', album_page)
                if len(key_find) > 0 and len(key_find[0]) > 80:
                    key = key_find[0]
                else:
                    # Not the first run: a missing token is unexpected, dump the page
                    if self.account_info[2] != "0":
                        log.error(account_name + " 没有找到下一页的token,将该页保存:")
                        log.error(album_page)
                    is_over = True
        log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))
        # Re-sort: move files from the temp dir into the numbered final dir
        if IS_SORT and image_count > 1:
            destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_file_path, account_name)
            if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                log.step(account_name + " 图片从下载目录移动到保存目录成功")
            else:
                log.error(account_name + " 创建图片子目录 %s 失败" % destination_path)
                tool.process_exit()
        # New archive record: bump cumulative count, store newest album id
        if first_album_id != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_album_id
        # Persist the final per-account state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # Exit code 0 means a deliberate early stop; anything else is abnormal
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")
def run(self):
    """Download all new images from one member's diary (blog) entries.

    Reads self.account_info = [account_id, image_count, last_diary_id,
    (optional) account_name]; pages through diary listings, downloads every
    image from entries newer than last_diary_id, optionally re-sorts files,
    then updates the archive record and the global image counter.
    """
    global TOTAL_IMAGE_COUNT
    account_id = self.account_info[0]
    # Optional fourth field is a human-readable account name
    if len(self.account_info) >= 4 and self.account_info[3]:
        account_name = self.account_info[3]
    else:
        account_name = self.account_info[0]
    try:
        log.step(account_name + " 开始")
        # When re-sorting is enabled download into a temp dir first,
        # otherwise write straight into the final download dir
        if IS_SORT:
            image_path = os.path.join(IMAGE_TEMP_PATH, account_name)
        else:
            image_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
        image_count = 1
        page_count = 1
        first_diary_id = "0"   # newest diary id, becomes the new archive mark
        is_over = False
        need_make_image_dir = True
        while not is_over:
            # Fetch one page of diary entries
            diary_list = get_one_page_diary_data(account_id, page_count)
            if diary_list is None:
                log.error(account_name + " 第%s页日志列表解析异常" % page_count)
                tool.process_exit()
            # Empty page: all diary entries fetched
            if len(diary_list) == 0:
                break
            for diary_info in list(diary_list):
                # Diary id is embedded in the entry markup as id=...&
                diary_id = tool.find_sub_string(diary_info, "id=", "&")
                if not diary_id:
                    log.error(account_name + " 日志id解析异常,日志信息:%s" % diary_info)
                    continue
                # Reached an entry already downloaded in a previous run — stop
                if int(diary_id) <= int(self.account_info[2]):
                    is_over = True
                    break
                # Remember the first (newest) diary id as the new archive record
                if first_diary_id == "0":
                    first_diary_id = diary_id
                log.trace(account_name + " 日志id %s" % diary_id)
                # All image URLs inside this diary entry
                image_url_list = get_image_url_list(diary_info)
                for image_url in image_url_list:
                    # Scheme-less URLs are relative to the site; build a full URL
                    if image_url[:7] != "http://" and image_url[:8] != "https://":
                        if image_url[0] == "/":
                            image_url = "http://www.keyakizaka46.com%s" % image_url
                        else:
                            image_url = "http://www.keyakizaka46.com/%s" % image_url
                    log.step(account_name + " 开始下载第%s张图片 %s" % (image_count, image_url))
                    # Create the image directory lazily on first download
                    if need_make_image_dir:
                        if not tool.make_dir(image_path, 0):
                            log.error(account_name + " 创建图片下载目录 %s 失败" % image_path)
                            tool.process_exit()
                        need_make_image_dir = False
                    file_type = image_url.split(".")[-1]
                    file_path = os.path.join(image_path, "%04d.%s" % (image_count, file_type))
                    if tool.save_net_file(image_url, file_path):
                        log.step(account_name + " 第%s张图片下载成功" % image_count)
                        image_count += 1
                    else:
                        log.error(account_name + " 第%s张图片 %s 获取失败" % (image_count, image_url))
                # Stop once the configured number of images has been downloaded
                if 0 < GET_IMAGE_COUNT < image_count:
                    is_over = True
                    break
            if not is_over:
                # Stop when the configured page limit is reached
                if 0 < GET_PAGE_COUNT <= page_count:
                    is_over = True
                else:
                    page_count += 1
        log.step(account_name + " 下载完毕,总共获得%s张图片" % (image_count - 1))
        # Re-sort: move files from the temp dir into the numbered final dir
        if IS_SORT and image_count > 1:
            destination_path = os.path.join(IMAGE_DOWNLOAD_PATH, account_name)
            if robot.sort_file(image_path, destination_path, int(self.account_info[1]), 4):
                log.step(account_name + " 图片从下载目录移动到保存目录成功")
            else:
                log.error(account_name + " 创建图片子目录 %s 失败" % destination_path)
                tool.process_exit()
        # New archive record: bump cumulative count, store newest diary id
        if first_diary_id != "0":
            self.account_info[1] = str(int(self.account_info[1]) + image_count - 1)
            self.account_info[2] = first_diary_id
        # Persist the final per-account state
        tool.write_file("\t".join(self.account_info), NEW_SAVE_DATA_PATH)
        self.thread_lock.acquire()
        TOTAL_IMAGE_COUNT += image_count - 1
        ACCOUNTS.remove(account_id)
        self.thread_lock.release()
        log.step(account_name + " 完成")
    except SystemExit, se:
        # Exit code 0 means a deliberate early stop; anything else is abnormal
        if se.code == 0:
            log.step(account_name + " 提前退出")
        else:
            log.error(account_name + " 异常退出")