def pager(task, rule):
    """Expand a category listing page into the list of per-page crawl URLs.

    task: dict with "text" (raw HTML), "url" (listing URL) and optional
          "limit" (max number of pages to generate).
    rule: dict with "normal", an XPath that extracts the page count.
    Returns a list of page URLs, or None when parsing fails or the URL
    lacks the expected category parameter.
    """
    try:
        tree = etree.HTML(task["text"])
    except Exception:
        # Was a bare `except:` — narrowed so Ctrl-C etc. still propagate.
        traceback.print_exc()
        return
    count = tree.xpath(rule["normal"])
    if not count:
        log_with_time("pager, no count: %s" % task["url"])
        return
    count = int(count[0])
    if count > 450:
        # Too many pages for one category; log so it can be split upstream.
        log_with_time("need split page: %s" % task["url"])
    url = task["url"]
    # Category id(s), e.g. "cat=1713,3258,3297" -> "1713,3258,3297".
    cats = re.findall("=([0-9,]+)", url)
    # Optional price-range filter fragment, e.g. "ev=exprice_0-99%40".
    price_range = re.findall("(ev=.*%40)", url)
    if price_range:
        base = "%s&%s" % (normal_base, price_range[0])
    else:
        base = normal_base
    if not cats:
        log_with_time("no cats in url: %s" % url)
        return
    if task.get("limit"):
        count = min(task.get("limit"), count)
    # One URL per page, 1-based.
    return [base.format(async_http.quote(cats[0]), i)
            for i in range(1, count + 1)]
def pager(task, rule):
    """Turn a category listing response into the per-page URLs to fetch.

    Reads the page count out of task["text"] via rule["normal"] and builds
    one URL per page from the category id found in task["url"]; returns
    None when the HTML cannot be parsed or no category id is present.
    """
    try:
        doc = etree.HTML(task["text"])
    except:
        traceback.print_exc()
        return
    matches = doc.xpath(rule["normal"])
    if not matches:
        log_with_time("pager, no count: %s" % task["url"])
        return
    total = int(matches[0])
    if total > 450:
        log_with_time("need split page: %s" % task["url"])
    url = task["url"]
    cats = re.findall("=([0-9,]+)", url)
    price_range = re.findall("(ev=.*%40)", url)
    # Append the price-range fragment to the base template when present.
    base = "%s&%s" % (normal_base, price_range[0]) if price_range else normal_base
    if not cats:
        log_with_time("no cats in url: %s" % url)
        return
    limit = task.get("limit")
    if limit:
        total = min((limit, total))
    pages = []
    for page_no in range(1, total + 1):
        pages.append(base.format(async_http.quote(cats[0]), page_no))
    return pages
def rt_parser(items):
    """Fetch real-time prices and stock states for *items* and merge them.

    items: mapping/collection understood by get_pids(); yields product ids.
    Returns format_price() over a list of (pid, price, in_stock) tuples,
    or None on any fetch/parse failure.
    """
    pids = get_pids(items)
    if not pids:
        # BUG fix: original logged undefined name `entries` (NameError).
        log_with_time("got nothing: %s" % items)
        return
    purl = price_url % (",".join(["J_" + i for i in pids]),
                        random.randint(1000000, 10000000),
                        int(time.time() * 1000))
    surl = stock_url % (async_http.quote(",".join(pids)),
                        random.randint(1000000, 10000000),
                        int(time.time() * 1000))
    price_res = simple_http.get(purl)
    stock_res = simple_http.get(surl)
    if price_res["status"] != 200 or stock_res["status"] != 200:
        # BUG fix: original referenced undefined `price["res"]` (NameError).
        log_with_time("not200: %s, %s" % (price_res["status"],
                                          stock_res["status"]))
        return
    try:
        price_json = jsonp_json(price_res["text"])
        # Stock endpoint replies in GBK, not UTF-8.
        stock_json = jsonp_json(stock_res["text"].decode("gbk"))
    except Exception:
        traceback.print_exc()
        return
    prices = {}
    for entry in price_json:
        # ids arrive as "J_<pid>"; keep just the pid part.
        prices[entry["id"].split("_")[1]] = entry["p"]
    stocks = {}
    for pid, info in stock_json.items():
        state = info["StockStateName"]
        # "有货"/"现货" mean available; anything else counts as out of stock.
        stocks[pid] = 1 if (u"有货" in state or u"现货" in state) else 0
    # Robustness fix: default to 0 instead of KeyError when a pid has a
    # price but no stock entry.
    ret = [(str(pid), str(prices[pid]), stocks.get(pid, 0)) for pid in prices]
    return format_price(ret)
def stock_filter(items):
    """Build the stock-query URL for *items* and bundle it with the prices.

    items: dict keyed by product id; the keys are joined into the stock
    endpoint's query string. Returns {"url": <stock url>, "price": items}.
    """
    joined_ids = ",".join(items.keys())
    query_url = stock_url % (async_http.quote(joined_ids),
                             random.randint(1000000, 10000000),
                             int(time.time() * 1000))
    return {"url": query_url, "price": items}
def rt_parser(items):
    """Query real-time price and stock endpoints for *items*.

    Duplicate definition of rt_parser also present earlier in this file;
    fixed the same way here. Returns format_price() over
    (pid, price, in_stock) tuples, or None on failure.
    """
    pids = get_pids(items)
    if not pids:
        # BUG fix: `entries` was undefined here (NameError on empty input).
        log_with_time("got nothing: %s" % items)
        return
    purl = price_url % (",".join(["J_" + i for i in pids]),
                        random.randint(1000000, 10000000),
                        int(time.time() * 1000))
    surl = stock_url % (async_http.quote(",".join(pids)),
                        random.randint(1000000, 10000000),
                        int(time.time() * 1000))
    price_res = simple_http.get(purl)
    stock_res = simple_http.get(surl)
    if price_res["status"] != 200 or stock_res["status"] != 200:
        # BUG fix: `price["res"]` was undefined; log the actual statuses.
        log_with_time("not200: %s, %s" % (price_res["status"],
                                          stock_res["status"]))
        return
    try:
        price_json = jsonp_json(price_res["text"])
        # Stock endpoint responds in GBK encoding.
        stock_json = jsonp_json(stock_res["text"].decode("gbk"))
    except Exception:
        traceback.print_exc()
        return
    prices = {}
    for entry in price_json:
        # "J_<pid>" -> "<pid>"
        prices[entry["id"].split("_")[1]] = entry["p"]
    stocks = {}
    for pid, info in stock_json.items():
        state = info["StockStateName"]
        stocks[pid] = 1 if (u"有货" in state or u"现货" in state) else 0
    # Robustness fix: stocks.get() avoids KeyError when the stock endpoint
    # omits a pid the price endpoint returned.
    ret = [(str(pid), str(prices[pid]), stocks.get(pid, 0)) for pid in prices]
    return format_price(ret)