def rt_parser(items):
    pids = get_pids(items)
    if not pids:
        log_with_time("got nothing: %s" % items)
        return
    purl = price_url % (",".join(["J_" + i for i in pids]),
                        random.randint(1000000, 10000000),
                        int(time.time() * 1000))
    surl = stock_url % (async_http.quote(",".join(pids)),
                        random.randint(1000000, 10000000),
                        int(time.time() * 1000))
    price_res = simple_http.get(purl)
    stock_res = simple_http.get(surl)
    if price_res["status"] != 200 or stock_res["status"] != 200:
        log_with_time("not200: %s %s" % (price_res["status"],
                                         stock_res["status"]))
        return
    try:
        price_json = jsonp_json(price_res["text"])
        stock_json = jsonp_json(stock_res["text"].decode("gbk"))
    except:
        traceback.print_exc()
        return
    prices = {}
    for i in price_json:
        prices[i["id"].split("_")[1]] = i["p"]
    stocks = {}
    for k, v in stock_json.items():
        s = v["StockStateName"]
        # "有货" / "现货" both mean the item is in stock
        if u"有货" in s or u"现货" in s:
            stocks[k] = 1
        else:
            stocks[k] = 0
    ret = []
    for pid in prices:
        ret.append((str(pid), str(prices[pid]), stocks[pid]))
    return format_price(ret)
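# rt_parser() above relies on a jsonp_json() helper that is not defined in
# this file.  The sketch below is an assumption about what it does (strip the
# JSONP callback wrapper and parse the payload); the real helper in the
# original code base may differ.
import json

def jsonp_json(text):
    # a JSONP body looks like callbackName({...}) or callbackName([...])
    start = text.find("(")
    end = text.rfind(")")
    if start < 0 or end < 0:
        # no callback wrapper, assume plain JSON
        return json.loads(text)
    return json.loads(text[start + 1:end])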
def get_work_list(author_url):
    if not author_url:
        pdb.set_trace()
    h, c = simple_http.get(author_url, proxy=proxy)
    if h["status"] != 200:
        pdb.set_trace()
    t = etree.HTML(c)
    urls = t.xpath(work_last)
    total = 0
    ret = []
    if len(urls) == 0:
        total = 0
    elif len(urls) < 6:
        total = page_to_id(urls[-2].attrib["href"])
    else:
        total = page_to_id(urls[-1].attrib["href"])
    ret.extend(zip([x.attrib["src"] for x in t.xpath(work_title2)],
                   [base + x.attrib["href"] for x in t.xpath(work_url)],
                   [x.text for x in t.xpath(author_name)]))
    for i in range(2, total + 1):
        h, c = simple_http.get(author_url + "page=%d/" % i, proxy=proxy)
        t = etree.HTML(c)
        ret.extend(zip([x.attrib["src"] for x in t.xpath(work_title2)],
                       [base + x.attrib["href"] for x in t.xpath(work_url)],
                       [x.text for x in t.xpath(author_name)]))
    return ret
def get_work_list_digital(author_url):
    while True:
        h, c = simple_http.get(author_url, proxy=proxy)
        if h["status"] == 200:
            break
    t = etree.HTML(c)
    urls = t.xpath(p_work_last)
    total = 0
    ret = []
    if len(urls) == 0:
        total = 0
    elif len(urls) < 6:
        total = page_to_id(urls[-2].attrib["href"])
    else:
        total = page_to_id(urls[-1].attrib["href"])
    ret.extend(zip([x.attrib["alt"] for x in t.xpath(p_work_title)],
                   [x.attrib["href"] for x in t.xpath(p_work_url)]))
    for i in range(2, total + 1):
        while True:
            h, c = simple_http.get(author_url + "page=%d/" % i, proxy=proxy)
            if h["status"] == 200:
                break
        t = etree.HTML(c)
        ret.extend(zip([x.attrib["alt"] for x in t.xpath(p_work_title)],
                       [x.attrib["href"] for x in t.xpath(p_work_url)]))
    return ret
def down_one(title, url):
    h, c = simple_http.get(url, proxy=proxy)
    if not h:
        pdb.set_trace()
    if h["status"] != 200:
        pdb.set_trace()
    t = etree.HTML(c)
    pics = t.xpath(image_xpath)
    torrent = [x for x in t.xpath(torrent_xpath)
               if x.attrib["href"].startswith("attach")]
    for i, v in enumerate(pics):
        name = ("%s-%d.jpg" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        if "jpg" not in v.attrib["src"]:
            continue
        h, c = simple_http.get(v.attrib["src"], proxy=proxy)
        if h["status"] != 200:
            if h["status"] == 302:
                h, c = simple_http.get(h["Location"], proxy=proxy)
                if h["status"] != 200:
                    pdb.set_trace()
        # binary content, so open in binary mode
        f = open(name, "wb+")
        f.write(c)
        f.close()
    for i, v in enumerate(torrent):
        name = ("%s-%d.torrent" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        h, c = simple_http.get(base + v.attrib["href"], proxy=proxy)
        if h["status"] != 200:
            pdb.set_trace()
        f = open(name, "wb+")
        f.write(c)
        f.close()
def down_one(title, url):
    h, c = simple_http.get(url, proxy=proxy)
    if not h:
        return
    if h["status"] != 200:
        return
    t = etree.HTML(c)
    pics = t.xpath(image_xpath)
    torrent = [x for x in t.xpath(torrent_xpath)
               if x.attrib["href"].startswith("attach")]
    for i, v in enumerate(pics):
        name = ("%s-%d.jpg" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        if "jpg" not in v.attrib["src"]:
            continue
        try:
            h, c = simple_http.get(v.attrib["src"], proxy=proxy)
        except socket.timeout:
            continue
        except socket.error:
            continue
        if h["status"] != 200:
            if h["status"] == 302:
                location = h["Location"]
                if not location.startswith("http"):
                    # relative redirect: rebuild an absolute URL from the
                    # original image host
                    url = {}
                    url["host"] = simple_http.urlparse(v.attrib["src"])["host"]
                    url["path"] = location
                    location = simple_http.generate_url(url)
                h, c = simple_http.get(location, proxy=proxy)
                if h["status"] != 200:
                    continue
        # ignore responses smaller than 10 KB
        if len(c) < 10240:
            continue
        try:
            f = open(name.decode("utf-8"), "wb+")
        except IOError as e:
            print e
            continue
        f.write(c)
        f.close()
    for i, v in enumerate(torrent):
        name = ("%s-%d.torrent" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        h, c = simple_http.get(base + v.attrib["href"], proxy=proxy)
        if h["status"] != 200:
            continue
        try:
            f = open(name.decode("utf-8"), "wb+")
        except IOError as e:
            print e
            continue
        f.write(c)
        f.close()
def task_get_http():
    run_http("get_http")
    try:
        os.wait()
    except KeyboardInterrupt:
        pass
    payload = {
        "t1": "v1",
        "t2": "v2"
    }
    simple_http.get("127.0.0.1:6711", query=payload)
    pdb.set_trace()
def down_one(title, url):
    h, c = simple_http.get(url, proxy=proxy, timeout=20, cookie=cookie)
    if not h:
        return
    if h["status"] != 200:
        return
    t = etree.HTML(c)
    pics = t.xpath(img_xpath) + t.xpath(img2_xpath)
    torrent = [x for x in t.xpath(torrent_xpath)
               if x.attrib["href"].startswith("attach")]
    for i, v in enumerate(pics):
        name = ("%s-%d.jpg" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        if "jpg" not in v.attrib["src"]:
            continue
        try:
            h, c = simple_http.get(v.attrib["src"], proxy=proxy)
        except socket.timeout:
            continue
        except socket.error:
            continue
        if h["status"] != 200:
            if h["status"] == 302:
                location = h["Location"]
                if not location.startswith("http"):
                    url = {}
                    url["host"] = simple_http.urlparse(v.attrib["src"])["host"]
                    url["path"] = location
                    location = simple_http.generate_url(url)
                h, c = simple_http.get(location, proxy=proxy)
                if h["status"] != 200:
                    continue
        if len(c) < 10240:
            continue
        try:
            f = open(name, "wb+")
        except IOError as e:
            print e
            continue
        f.write(c)
        f.close()
    for i, v in enumerate(torrent):
        name = ("%s-%d.torrent" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        h, c = simple_http.get(base + v.attrib["href"], proxy=proxy,
                               cookie=cookie, timeout=20)
        if h["status"] != 200:
            continue
        try:
            f = open(name, "wb+")
        except IOError as e:
            print e
            continue
        f.write(c)
        f.close()
def download_link(name, link):
    h, c = simple_http.get(link, header=simple_http.download_header)
    if h["status"] != 200:
        if h["status"] == 302:
            h, c = simple_http.get(h["Location"],
                                   header=simple_http.download_header)
            if h["status"] != 200:
                pdb.set_trace()
    f = open(name, "wb+")
    f.write(c)
    f.close()
def get_info(work_url):
    h, c = simple_http.get(work_url, proxy=proxy)
    t = etree.HTML(c)
    tags = t.xpath(skip_xpath)
    for i in tags:
        if i.text == skip:
            return {}
    try:
        cover_img = t.xpath(cover)[0].attrib["href"]
    except:
        cover_img = ""
    pics = [replace(bigpic, x.attrib["src"], "jp") for x in t.xpath(photo)]
    try:
        name = t.xpath(title)[0].text
    except:
        name = ""
    try:
        id = t.xpath(aid)[-2].text
    except:
        id = ""
    return {
        "cover": cover_img,
        "pics": pics,
        "name": name,
        "aid": id
    }
def get_category_dvd():
    h, c = simple_http.get(dvd, proxy=proxy)
    t = etree.HTML(c)
    urls = t.xpath(proun)
    for i in urls:
        print utf8(i.text)
        get_author_list(base + i.attrib["href"])
def sis_login(user, password):
    h, c = simple_http.get(base + "logging.php?action=login",
                           proxy=proxy, timeout=10)
    t = etree.HTML(c.decode("gbk"))
    formhash = t.xpath(
        "/html/body/div[4]/div[1]/div[4]/div[1]/form/input[1]"
    )[0].attrib["value"].encode("utf-8")
    referer = t.xpath(
        "/html/body/div[4]/div[1]/div[4]/div[1]/form/input[2]"
    )[0].attrib["value"].encode("utf-8")
    payload = {
        "formhash": formhash,
        "referer": base + referer,
        "cookietime": "2592000",
        "loginfield": "username",
        "username": user,
        "password": password,
        "questionid": "",
        "answer": "",
        "loginsubmit": "true"
    }
    h, c = simple_http.post(base + "logging.php?action=login",
                            proxy=proxy, payload=payload, timeout=10)
    return simple_http.client_cookie(h["Set-Cookie"])
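# Hypothetical usage of sis_login() (the credentials and forum id below are
# placeholders, not values from the original code): the returned cookie
# string is what the module-level `cookie` used by get_page(), get_content()
# and down_one() is expected to hold.
#
#   cookie = sis_login("my_user", "my_password")
#   for page in range(1, 4):
#       get_page("143", page)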
def get_page(t, i):
    url = "%sforum-%s-%d.html" % (base, t, i)
    h, c = simple_http.get(url, cookie=cookie, proxy=proxy, timeout=10)
    t = etree.HTML(c.decode("gbk"))
    al = t.xpath(p_xpath) + t.xpath(p2_xpath)
    for i, v in enumerate(al):
        print "[%d/%d] %s" % (i + 1, len(al), v.text)
        get_content(v.text, v.attrib["href"])
def do(url, n):
    h, content = simple_http.get(url % n)
    if h["status"] != 200:
        pdb.set_trace()
    s = etree.HTML(content)
    videos = s.xpath(xpath)
    for i in videos:
        print "%s\n%s" % (i.attrib["title"], i.attrib["href"])
def down(url, output, proxy=""): res = simple_http.get(url, proxy=proxy, header=simple_http.download_header) if res["status"] != 200: pprint.pprint(h) exit(1) f = open(output, "wb+") f.write(res["text"]) f.close()
def down(url, output, proxy=""): h, c = simple_http.get(url, proxy=proxy, header=simple_http.download_header) if h["status"] != 200: pprint.pprint(h) exit(1) f = open(output, "w+") f.write(c) f.close()
def down(url, output, proxy=""): h, c = simple_http.get(url, proxy=proxy, header=simple_http.download_header) if h["status"] != 200: pprint.pprint(h) exit(1) f = open(output, "wb+") f.write(c) f.close()
def down_one(title, url):
    h, c = simple_http.get(url, proxy=proxy)
    if not h:
        pdb.set_trace()
    if h["status"] != 200:
        pdb.set_trace()
    t = etree.HTML(c)
    pics = t.xpath(image_xpath)
    torrent = [x for x in t.xpath(torrent_xpath)
               if x.attrib["href"].startswith("attach")]
    for i, v in enumerate(pics):
        name = ("%s-%d.jpg" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        if "jpg" not in v.attrib["src"]:
            continue
        try:
            h, c = simple_http.get(v.attrib["src"], proxy=proxy)
        except socket.timeout:
            continue
        if h["status"] != 200:
            if h["status"] == 302:
                location = h["Location"]
                if not location.startswith("http"):
                    url = {}
                    url["host"] = simple_http.urlparse(v.attrib["src"])["host"]
                    url["path"] = location
                    location = simple_http.generate_url(url)
                h, c = simple_http.get(location, proxy=proxy)
                if h["status"] != 200:
                    pdb.set_trace()
        f = open(name, "wb+")
        f.write(c)
        f.close()
    for i, v in enumerate(torrent):
        name = ("%s-%d.torrent" % (title, i)).replace("/", "-")
        if os.path.exists(name):
            continue
        h, c = simple_http.get(base + v.attrib["href"], proxy=proxy)
        if h["status"] != 200:
            pdb.set_trace()
        f = open(name, "wb+")
        f.write(c)
        f.close()
def get_author_list(category):
    h, c = simple_http.get(category, proxy=proxy)
    t = etree.HTML(c)
    urls = t.xpath(last)
    total = 0
    if len(urls) == 0:
        total = 0
    elif len(urls) < 6:
        total = page_to_id(urls[-2].attrib["href"])
    else:
        total = page_to_id(urls[-1].attrib["href"])
    for i in t.xpath(p):
        print utf8(i.xpath("string()"))
        print utf8(base + i.attrib["href"])
    for i in range(2, total + 1):
        h, c = simple_http.get(category + "/page=%d" % i, proxy=proxy)
        t = etree.HTML(c)
        for i in t.xpath(p):
            print utf8(i.xpath("string()"))
            print utf8(base + i.attrib["href"])
def redis_proxy_get():
    import simple_http
    redis_proxy_connect()
    res = simple_http.get("http://127.0.0.1:8866/redis_proxy",
                          query={
                              "node": "test",
                              "type": "list",
                              "key": "jd_page",
                              "batch": "10"
                          })
    pdb.set_trace()
    assert res["status"] == 200
def redis_proxy_connect():
    import simple_http
    import json
    res = simple_http.get("http://127.0.0.1:8866/connect",
                          query={
                              "node": "test",
                              "db": json.dumps({
                                  "host": "127.0.0.1",
                                  "port": 6379
                              })
                          })
    assert res["status"] == 200
def get_categroy_digital():
    # keywords are romanized kana syllables: the prefixes in p1 combined
    # with the vowels in p2 (a, i, u, ..., ka, ki, ku, ...)
    p1 = ["", "k", "s", "t", "n", "h", "m", "y", "r", "w"]
    p2 = ["a", "i", "u", "e", "o"]
    pages = []
    for i in p1:
        pages.extend(["%s=/keyword=%s/" % (digital_base, i + x) for x in p2])
    authors = []
    for i in pages:
        h, c = simple_http.get(i, proxy=proxy)
        t = etree.HTML(c)
        for i in t.xpath(p_digital) + t.xpath(ps_digital):
            print utf8(i.xpath("string()"))
            print utf8(i.attrib["href"])
def down_pic(name, pic_url):
    print name, pic_url
    if os.path.exists(name):
        return
    h, c = simple_http.get(pic_url, proxy=proxy)
    if h["status"] != 200:
        print "skip", pic_url
        return
    try:
        f = open(name, "wb+")
    except IOError:
        f = open(name.split("-")[-1], "wb+")
    f.write(c)
    f.close()
def download_album(aid):
    header, content = simple_http.get("http://music.baidu.com/album/%s" % aid)
    if header["status"] != 200:
        print "failed"
        print header
        exit(1)
    t = etree.HTML(content)
    songs = []
    for i in t.xpath(ALBUM_XPATH):
        songs.append((i.attrib["title"], i.attrib["href"]))
    for i, v in enumerate(songs):
        link = download_by_id(v[1].strip("/song/"))
        if not link:
            continue
        print "===================\n[%d/%d]: %s\n%s" % (i + 1, len(songs),
                                                        link[0], link[1])
def down_page(tid, pid):
    h, c = simple_http.get(thread_base % (tid, pid), proxy=proxy)
    if h["status"] != 200:
        pdb.set_trace()
    t = etree.HTML(c)
    url = []
    if pid == 1:
        a = t.xpath(p1_xpath)
        b = t.xpath(p1_special)
        url.extend(a)
        url.extend(b)
    else:
        url.extend(t.xpath(pn_xpath))
    for i, v in enumerate(url):
        print "[p%d-%d]: %s" % (pid, i + 1, v.text)
        down_one(v.text.encode("utf-8"), base + v.attrib["href"])
def get_songids(uid):
    ret = []
    for i in range(0xffff):
        query = {"start": str(i * 25), "ting_uid": uid, "order": "hot"}
        h, c = simple_http.get("http://music.baidu.com/data/user/getsongs",
                               query=query)
        if h["status"] != 200:
            break
        t = json.loads(c)["data"]["html"]
        tree = etree.HTML(t)
        result = tree.xpath(SONGID_XPATH)
        if not result:
            break
        for i in result:
            if "class" not in i.attrib:
                ret.append((i.attrib["title"],
                            ID_PATTERN.match(i.attrib["href"]).groups()[0]))
    return ret
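# ID_PATTERN is not defined in this file.  get_songids() above uses it to
# pull the numeric song id out of an href such as "/song/12345678"; the
# definition below is only a guess consistent with that usage.
import re
ID_PATTERN = re.compile(r"/song/(\d+)")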
def post(self):
    m = self.get_arguments("sfile")
    t = self.get_arguments("stype")
    e = self.get_arguments("sexpr")
    if not m or not t or not e:
        self.write("arguments not enough")
        return
    m = m[0]
    t = t[0]
    e = e[0]
    fn = None
    if not m.startswith("file://"):
        fn = "list.html"
        f = open("list.html", "w")
        res = simple_http.get(m)
        if res["status"] != 200:
            self.write("err, bad response. %r" % res["status"])
            return
        f.write(res["text"])
        f.close()
    else:
        fn = m[7:]
    e = e.encode("utf-8")
    p = ""
    if t == "xpath":
        p = ","
    elif t == "text":
        p = "-"
    elif t == "line":
        p = ">"
    elif t == "attr":
        p = "."
    try:
        rt = subprocess.check_output(["./qxt", fn, p + e], stderr=sys.stderr)
    except Exception as e:
        import traceback
        traceback.print_exc()
        self.write("call failed.")
        return
    self.write(rt)
def get_songids(uid):
    ret = []
    for i in range(0xffff):
        query = {
            "start": str(i * 25),
            "ting_uid": uid,
            "order": "hot"
        }
        h, c = simple_http.get("http://music.baidu.com/data/user/getsongs",
                               query=query)
        if h["status"] != 200:
            break
        t = json.loads(c)["data"]["html"]
        tree = etree.HTML(t)
        result = tree.xpath(SONGID_XPATH)
        if not result:
            break
        for i in result:
            if "class" not in i.attrib:
                ret.append((i.attrib["title"], i.attrib["href"].split("#")[0]))
    return ret
def down_page(tid, pid):
    h, c = simple_http.get(thread_base % (tid, pid), proxy=proxy)
    if h["status"] != 200:
        return
    t = etree.HTML(c)
    url = []
    if pid == 1:
        a = t.xpath(p1_xpath)
        b = t.xpath(p1_special)
        url.extend(a)
        url.extend(b)
    else:
        url.extend(t.xpath(pn_xpath))
    for i, v in enumerate(url):
        try:
            down_one(v.text.encode("utf-8"), base + v.attrib["href"])
        except OSError:
            print "skip, [p%d-%d]: %s" % (pid, i + 1, v.text)
            continue
        print "[p%d-%d]: %s" % (pid, i + 1, v.text)
def get_content(title, url):
    h, c = simple_http.get(base + url, cookie=cookie, proxy=proxy, timeout=10)
    t = etree.HTML(c)
    down_one(title, base + url)
def getpage(url, nsecs=5):
    try:
        a = time.time()
        h, content = simple_http.get(url)
    except Exception as e:
        raise Exception("request failed: %s error %s" % (url, e))
    print "=========\npage done in %fs: %s\ntimeout: %ds\n=========" % (
        time.time() - a, url, nsecs)
    try:
        t = etree.HTML(content)
    except:
        print "fetch failed: %s" % url
        pprint.pprint(h)
        exit(1)
    urls = []
    host = simple_http.urlparse(url)["host"]
    # find all script, img and link resources referenced by the page
    for i in etree_util.query_element(t, "[script,img,link]"):
        attrib = i.attrib
        if "href" in attrib:
            url_dict = simple_http.urlparse(attrib["href"])
            if not url_dict["host"]:
                url_dict["host"] = host
            urls.append(simple_http.generate_url(url_dict))
        if "src" in attrib:
            url_dict = simple_http.urlparse(attrib["src"])
            if not url_dict["host"]:
                url_dict["host"] = host
            urls.append(simple_http.generate_url(url_dict))
    # fetch every resource in its own child process
    pids = []
    for i in urls:
        pid = os.fork()
        if not pid:
            try:
                a = time.time()
                simple_http.get(i)
                print "url done in %fs %s" % (time.time() - a, i)
            except Exception as e:
                print "url failed: %s" % i
                print "error %s" % e
                traceback.print_tb(sys.exc_info()[2])
            exit(0)
        else:
            pids.append(pid)

    # wait children for nsecs
    def clean_children(signum, frame):
        for i in urls:
            pid, _, _ = os.wait3(os.WNOHANG)
            if pid:
                del pids[pids.index(pid)]
        # kill them if they are still in progress
        for i in pids:
            os.kill(i, signal.SIGKILL)
        for i in pids:
            os.wait()
        print "request done, kill %d children" % len(pids)

    signal.setitimer(signal.ITIMER_REAL, nsecs)
    signal.signal(signal.SIGINT, clean_children)
    signal.signal(signal.SIGALRM, clean_children)
    # block until a signal fires
    time.sleep(0xffff)
# -*- coding: utf-8 -*-
import datetime
import PyRSS2Gen
import simple_http
import json

if __name__ == '__main__':
    header, content = simple_http.get(
        'http://api.acfun.tv/apiserver/content/channel?orderBy=1&channelId=110&pageSize=20&pageNo=1')
    x_content = json.loads(content)
    items = []
    article_list = x_content['data']['page']['list']
    for item_json in article_list:
        items.append(PyRSS2Gen.RSSItem(
            title=item_json['title'],
            link='http://www.acfun.tv/a/ac' + str(item_json['contentId']),
            description=item_json['description'],
            guid=PyRSS2Gen.Guid('http://www.acfun.tv/a/ac' + str(item_json['contentId'])),
            pubDate=datetime.datetime.now()))
    rss = PyRSS2Gen.RSS2(
        title='Acfun 文章区',
        link='http://www.acfun.tv/v/list110/index.htm',
        description='Acfun 文章区',
        lastBuildDate=datetime.datetime.now(),
        items=items)
    rss.write_xml(open("D:\\feed.xml", "w"), encoding="utf-8")