def parse_paging(index, loop):
    """
    Walk the paginated listing synchronously and queue unseen wallpapers.

    Starting at page ``index``, each listing page is fetched and every
    wallpaper link on it is checked against ``db_collection``. Links whose
    detail URL is not stored yet are submitted to a thread pool running
    ``parse_wallpaper_detail``. As soon as a page contains at least one
    already-stored link, paging stops after that page has been processed.

    :param index: page number to start from
    :param loop: whether to crawl at all; a falsy value makes this a no-op
    :return: None
    """
    title_suffix = "Wallpaper"
    # Iterate instead of recursing: one stack frame per page (or per retry)
    # would eventually hit the interpreter's recursion limit on long crawls.
    while loop:
        response = HTTP.get(paging_url + str(index))
        if response.status_code == 403:
            # A 403 means the IP has been blocked, so stop requesting.
            print("请求第", index, "页出现403,分页终止!")
            return
        if response.status_code != 200:
            print("请求第", index, "页失败,", RETRY_INTERVAL, "秒后重试")
            time.sleep(RETRY_INTERVAL)
            continue

        # Request succeeded: parse the page content.
        print("开始解析第", index, "页数据")
        # Leaving the ``with`` block waits for all submitted jobs to finish.
        with ThreadPoolExecutor(MAX_WORKS) as executor:
            selector = LXML.get_selector(response.content)
            for tag in selector.xpath("//a[@class='caption hidden-md hidden-sm hidden-xs']"):
                url = base_url + tag.get("href")
                title = (tag.get("title") or "").strip()
                # Remove the literal trailing word only. str.rstrip() takes a
                # *set of characters* and would also eat letters belonging
                # to the real title (e.g. "Pearl" -> "P").
                if title.endswith(title_suffix):
                    title = title[:-len(title_suffix)].strip()
                if db_collection.find({"detail_url": url}).count() <= 0:
                    executor.submit(parse_wallpaper_detail, url, title)
                else:
                    # Already stored: finish this page, then stop paging.
                    loop = False
        print("该页面已解析,解析终止" if not loop else "")
        index += 1
def spider(wiki):
    """
    Request the ``Chat`` controller of a wiki and return the parsed JSON.

    :param wiki: sub-domain that is placed in front of ``.wikia.com``
    :return: the response body, decoded as UTF-8 and parsed with ``json.loads``
    """
    endpoint = 'http://' + wiki + '.wikia.com/wikia.php'
    fields = {
        'controller': 'Chat',
        'format': 'json',
        'client': 'Chatserv',
        'version': chatserv.version
    }
    cookie = {'Cookie': chatserv.session}
    body = HTTP.get(endpoint, fields, cookie).read()
    return json.loads(body.decode('utf-8'))
def session(room, key=None, server=None, port=None):
    """
    Perform the socket.io handshake for a chat room and return its first field.

    :param room: numeric room id; must be greater than zero
    :param key: chat key sent with the handshake; a value equal to ``False``
        is rejected
    :param server: host name of the chat server (concatenated into the URL,
        so it has to be a string)
    :param port: port of the chat server (converted with ``str``)
    :return: the part of the handshake response that precedes the first ``:``
    :raises Exception: if ``room`` is not positive, if ``key`` equals
        ``False``, or if the server answers with a ``new Error('...')`` body
    """
    if room <= 0:
        # ``room`` is numeric here, so it must be converted before it can be
        # concatenated; otherwise a TypeError hides the real message.
        raise Exception('Invalid room ' + str(room))
    if key == False:  # noqa: E712 - equality kept on purpose (also matches 0)
        raise Exception('\'key\' is false')

    result = HTTP.get(
        'http://' + server + ':' + str(port) + '/socket.io/1/',
        {
            'name': chatserv.user,
            'key': key,
            'roomId': room,
            'client': 'Chatserv',
            'version': chatserv.version
        },
        {'Cookie': chatserv.session}
    ).read().decode('utf-8')

    if result[:11] == 'new Error(\'':
        # Strip the leading "new Error('" and the trailing two characters.
        raise Exception(result[11:-2])
    return result[:result.find(':')]
def parse_wallpaper_detail(url, title):
    """
    Parse a wallpaper detail page and store the result.

    The page at ``url`` is requested until it answers with 200 (parsed and
    written via ``db_update``) or 403 (treated as an IP ban; the function
    gives up). Any other status is retried after ``RETRY_INTERVAL`` seconds.

    :param url: address of the detail page
    :param title: name of the wallpaper
    :return: None
    """
    category_suffix = "wallpapers"
    # Retry in a loop rather than by recursion so a long outage cannot
    # exhaust the interpreter's recursion limit.
    while True:
        model = HDQWallsModel.HDQWallsModel(time.time())
        model.update_timestamp = time.time()
        model.detail_url = url
        model.title = title

        # Request the detail page.
        response = HTTP.get(url)
        if response.status_code == 403:
            print("ip被封禁了!请求终止!")
            return
        if response.status_code != 200:
            print("请求", title, "失败,", RETRY_INTERVAL, "秒后重试")
            time.sleep(RETRY_INTERVAL)
            continue

        selector = LXML.get_selector(response.content)
        model.author = LXML.get_first_attr_text(selector, "//a[@href and @target and @class]/i", "佚名").lstrip()
        model.author_link = LXML.get_first_attr(selector, "//a[@href and @target and @class]/i/..", "href", "")
        model.original_resolution = LXML.get_first_attr_text(selector, "//blockquote/footer/a[not(@style)]").lstrip()

        # Parse the category tags (English only).
        categories = []
        for tag in selector.xpath("//div/ul/li[@id='tags']/../a/li/span"):
            if tag.text is None:
                # An element without text carries no category.
                continue
            category = tag.text.strip().rstrip(",").strip()
            # Drop the literal trailing word only. str.rstrip() works on a
            # *set of characters* and would also remove letters of the real
            # category name (e.g. "flowers" -> "flo").
            if category.endswith(category_suffix):
                category = category[:-len(category_suffix)]
            categories.append(category.replace("-", " ").strip())
        model.category_list = categories

        # Parse the information about the original file.
        original_file = OriginalFileInfoModel.OriginalFileInfoModel()
        original_file.download_url = base_url + LXML.get_first_attr(selector, "//div[@class='wallpaper_container']/div/a[@rel='nofollow']", "href")
        original_file.file_name = os.path.basename(original_file.download_url)
        # Use the text after the *last* dot as the format; an empty string is
        # stored when the file name has no extension at all.
        original_file.file_format = os.path.splitext(original_file.file_name)[1].lstrip(".")
        model.original_file_info.update(original_file.__dict__)

        print("解析完成:", title, url)
        db_update(model)
        print("数据库写入完成")
        return
def download_file(item_dict):
    """
    Download the original file of a stored item and record its properties.

    The image is written below ``photos_cache_path`` under the MD5 of its
    download URL. Its width, height and size in bytes are added to
    ``item_dict["original_file_info"]`` and the item is upserted into
    ``db_collection``. A 401 response is treated as an IP ban and ends the
    download. Any other failure is retried after ``RETRY_INTERVAL`` seconds.

    :param item_dict: stored item; must contain ``detail_url`` and an
        ``original_file_info`` mapping with ``download_url`` and
        ``file_format``
    :return: None
    :raises KeyError: if one of the required keys is missing
    """
    # Resolve everything that depends only on the input before the retry
    # loop. A malformed item can never succeed, so it must fail fast instead
    # of being retried forever.
    original_file_info = item_dict["original_file_info"]
    download_url = original_file_info["download_url"]
    file_format = original_file_info["file_format"]
    detail_url = item_dict["detail_url"]
    file_name = hashlib.md5(download_url.encode("gbk")).hexdigest()
    file_path = photos_cache_path + "/" + file_name + "." + file_format

    while True:
        try:
            response = HTTP.get(download_url, use_proxy=True)
            if response.status_code == 200:
                with open(file_path, "wb") as f:
                    f.write(response.content)

                # Read the file information.
                Image.MAX_IMAGE_PIXELS = 1000000000
                # The context manager releases the image's file handle.
                with Image.open(file_path) as image:
                    original_file_info["width"] = image.width
                    original_file_info["height"] = image.height
                original_file_info["file_size"] = os.path.getsize(file_path)

                # Assign the information and write it to the database.
                item_dict["original_file_info"] = original_file_info
                db_collection.update({"detail_url": detail_url}, item_dict, upsert=True)
                print("图片下载完成", download_url)
                return
            if response.status_code == 401:
                print("ip已被封禁,下载停止")
                return
            print("下载失败", RETRY_INTERVAL, "秒后重试")
        except Exception:
            # Deliberately broad: any failure while fetching or storing is
            # retried. KeyboardInterrupt/SystemExit are no longer swallowed.
            print("文件下载失败")
        time.sleep(RETRY_INTERVAL)
def isloggedin():
    """
    Ask the community wiki's API whether the current session is logged in.

    The ``meta=userinfo`` query is answered for anonymous visitors as well,
    so a non-empty reply alone proves nothing. The reply is inspected: the
    session counts as logged in only when ``query.userinfo`` reports a
    non-zero ``id`` and carries no ``anon`` marker.

    :return: ``True`` if the reply describes a logged-in user, else ``False``
    """
    # NOTE(review): ``session`` is read from module scope and has to be the
    # cookie string at call time - confirm it is not shadowed by a function
    # of the same name.
    raw = HTTP.get(
        'http://community.wikia.com/api.php',
        {'action': 'query', 'meta': 'userinfo', 'format': 'json'},
        {'Cookie': session}
    ).read().decode('utf-8')
    reply = json.loads(raw)

    query = reply.get('query') if isinstance(reply, dict) else None
    userinfo = query.get('userinfo') if isinstance(query, dict) else None
    if not isinstance(userinfo, dict):
        # Error replies and unexpected shapes are not a logged-in state.
        return False
    return bool(userinfo.get('id')) and 'anon' not in userinfo
def connect(sock):
    """
    Long-poll the chat server for ``sock`` and dispatch every received frame.

    The loop polls the xhr-polling endpoint until the socket is flagged as
    killed, the server disconnects or disables reconnecting, or an error
    occurs. JSON events are queued on ``chatserv.stack`` for
    ``chatserv.io.receive``.

    :param sock: chat connection; this function reads ``server``, ``port``,
        ``session``, ``key`` and ``id`` and drives the ``connected`` event
    :return: None
    :raises Exception: on a frame length mismatch, an error frame, an
        unimplemented frame type or an unexpected HTTP status
    """
    while True:
        response = HTTP.get(
            # str() keeps the URL valid whether the port is stored as int or str.
            'http://' + sock.server + ':' + str(sock.port)
            + '/socket.io/1/xhr-polling/' + sock.session + '/',
            {
                'name': chatserv.user,
                'key': sock.key,
                'roomId': sock.id,
                'client': 'Chatserv',
                'version': chatserv.version
            },
            {'Cookie': chatserv.session},
            timeout=30
        )
        if sock._Chat__killed.isSet():
            break

        if response.status == 404:
            # this is what Torus does, I still don't know if it's good or bad
            continue
        if response.status != 200:
            # The status is numeric, so convert it before concatenating.
            raise Exception('Bad HTTP status ' + str(response.status))

        print('---------------------------------------')
        data = response.read().decode('utf-8')
        # noop is the single most common event (by far) and by definition
        # cannot be sent with another message. Skipping the string functions
        # and loop overhead most of the time should help suppress context
        # switching overhead and GIL overhead as well.
        # An empty body carries no frame either and is skipped the same way
        # instead of failing on data[0].
        if not data or data == '8::':
            continue

        # A lone message arrives without framing; wrap it so that both
        # shapes can be handled by the same loop.
        if data[0] != '\ufffd':
            data = '\ufffd' + str(len(data)) + '\ufffd' + data
        data = data.split('\ufffd')

        i = 1  # data[0] is an empty string
        while i < len(data):
            if int(data[i]) != len(data[i + 1]):
                raise Exception('Message length mismatch')  # TODO: ProtocolError
            message = data[i + 1]
            print(message)
            # Advance here: the branches below use ``continue``.
            i += 2

            # no switch, so these are in frequency order
            if message[0] == '4':  # json
                event = json.loads(message[4:])
                chatserv.stack.put(chatserv.StackContext(chatserv.io.receive, sock, event))
                if event['event'] == 'disableReconnect':
                    sock.connected.clear()
                    return
                elif event['event'] == 'forceReconnect':
                    pass  # re auth and such
            elif message[0] == '8':  # noop - just in case
                continue
            elif message[0] == '0':  # disconnect
                sock.connected.clear()
                return
            elif message[0] == '1':  # connect
                if sock.connected.is_set():
                    continue  # why it sometimes spams 1:: is beyond me
                sock.connected.set()
                sock.sendCommand('initquery')
            elif message[0] == '7':  # error
                sock.connected.clear()
                raise Exception(message[4:])
            else:
                sock.connected.clear()
                raise Exception('Received unimplemented data type ' + message[0])

    # Reached only when the socket was flagged as killed.
    sock.connected.clear()