def list_parser(task, rule):
    """Parse a GBK-encoded listing page and collect price records.

    ``task['text']`` is decoded as GBK (undecodable bytes are replaced),
    parsed as HTML and handed to one of three page-specific parsers chosen
    from ``task['url']``.

    Returns a dict with the keys ``result``, ``dps``, ``shop``, ``comment``
    and ``dp``; returns ``None`` (after logging) when the page cannot be
    decoded or parsed.
    """
    try:
        tree = etree.HTML(task['text'].decode('gbk', 'replace'))
    except:
        log_with_time("bad response %s" % task['url'])
        return

    # Pick the page-specific parser from the URL.
    if ebookurl in task['url']:
        parse_page = __book_list_parser1
    elif 'cp' in task['url']:
        parse_page = __book_list_parser2
    else:
        parse_page = __norm_list_parser1
    ret, comments, shop, dp = parse_page(tree, task, rule)

    fret = format_price(ret)
    # Stamp each formatted record (keyed by its second field) with "now".
    dps = {}
    for record in fret:
        dps[record[1]] = int(time.time())

    return {
        "result": fret,
        "dps": dps,
        "shop": shop,
        "comment": comments,
        "dp": dp,
    }
def list_parser(task, rule):
    """Parse a JSONP ``items(...)`` listing response into price records.

    Returns ``{"spider": ..., "dp": ...}`` on success. When the response
    lacks the ``items(`` marker or a ``data`` key, the problem is logged and
    an empty list is returned instead.
    """
    ret = []
    marker = re.search("items\(", task['text'])
    if marker is None:
        log_with_time("bad response: %r" % task['url'])
        return ret

    # Everything after "items(" minus the final character is fed to the
    # JSON decoder (the final character is presumably the closing paren).
    payload = json.loads(task['text'][marker.end():-1])
    if 'data' not in payload:
        log_with_time("bad response: %r" % task['url'])
        return ret

    dps = []
    for product in payload['data']['product']:
        # A truthy 'is_cos' is reported as stock 0.
        stock = 0 if product['is_cos'] else 1
        price = product['price_min']
        if not price:
            log_with_time("bad response: %r %r" % (task['url'], product['commodity_id']))
            continue
        dps.append((product["url"], ""))
        ret.append((str(product['url']), str(price), stock))

    return {
        "spider": format_price(ret),
        "dp": dps
    }
def rt_parser(items):
    """Fetch current prices and stock states for ``items``.

    Product ids come from ``get_pids(items)``. One price request and one
    stock request are issued through ``simple_http.get``; both responses are
    JSONP and are decoded with ``jsonp_json``.

    Returns the output of ``format_price`` for ``(pid, price, stock)``
    tuples, or ``None`` when there are no ids, a request does not return
    status 200, or a response cannot be decoded.
    """
    pids = get_pids(items)
    if not pids:
        # Fixed: the original referenced an undefined name here.
        log_with_time("got nothing: %s" % (items,))
        return

    purl = price_url % (
        ",".join(["J_" + i for i in pids]),
        random.randint(1000000, 10000000),
        int(time.time() * 1000),
    )
    surl = stock_url % (
        async_http.quote(",".join(pids)),
        random.randint(1000000, 10000000),
        int(time.time() * 1000),
    )
    price_res = simple_http.get(purl)
    stock_res = simple_http.get(surl)
    if price_res["status"] != 200 or stock_res["status"] != 200:
        # Fixed: the original indexed an undefined name and raised NameError.
        log_with_time("not200: %s %s" % (price_res["status"], stock_res["status"]))
        return

    try:
        price_json = jsonp_json(price_res["text"])
        stock_json = jsonp_json(stock_res["text"].decode("gbk"))
    except Exception:
        traceback.print_exc()
        return

    # Price ids look like "<prefix>_<pid>"; keep the part after the underscore.
    prices = {}
    for entry in price_json:
        prices[entry["id"].split("_")[1]] = entry["p"]

    stocks = {}
    for pid, info in stock_json.items():
        state = info["StockStateName"]
        stocks[pid] = 1 if (u"有货" in state or u"现货" in state) else 0

    ret = []
    for pid in prices:
        if pid not in stocks:
            # A price without a matching stock entry is skipped, not fatal.
            log_with_time("key not in stocks: %s" % pid)
            continue
        ret.append((str(pid), str(prices[pid]), stocks[pid]))
    return format_price(ret)
def list_parser(task):
    """Collect (link, price, 1) records from a listing page.

    Two sources are combined: JSON-like fragments containing ``sell_price``
    found by regex in the raw page, and HTML nodes selected by
    ``task["rule"]["rule"]``. The combined list is passed through
    ``format_price`` and returned.
    """
    rule = task["rule"]
    content = task["recv"].getvalue()
    records = []

    # Pass 1: inline fragments embedded in the page source.
    for fragment in re.findall('({.*?sell_price.*?}),', content):
        try:
            item = json.loads(fragment)
        except ValueError:
            # Not valid JSON after all; ignore this fragment.
            continue
        records.append((item_base % item["id"], item["sell_price"], 1))

    # Pass 2: statically rendered product nodes.
    tree = etree.HTML(content)
    for node in tree.xpath(rule["rule"]):
        link = node.xpath("div[@class = 'cat-item-pic']/a/@href")
        price = node.xpath(
            "figcaption[@class = 'cat-item-inf']/p/span[@class = 'cat-pire-nub']/text()"
        )
        if link and price:
            records.append((link[0], price[0], 1))
        else:
            log_with_time("rule error: %s" % task["url"])

    return format_price(records)
def list_parser(task, rule):
    """Turn a JSONP ``items(...)`` product listing into spider records.

    On success returns a dict with ``spider`` (formatted price records) and
    ``dp`` (``(url, "")`` pairs). If the ``items(`` marker or the ``data``
    key is missing, logs the URL and returns an empty list.
    """
    ret = []
    dps = []

    found = re.search("items\(", task['text'])
    if not found:
        log_with_time("bad response: %r" % task['url'])
        return ret

    # Slice off the "items(" prefix and the last character before decoding.
    decoded = json.loads(task['text'][found.end():-1])
    if 'data' not in decoded:
        log_with_time("bad response: %r" % task['url'])
        return ret

    for entry in decoded['data']['product']:
        if entry['is_cos']:
            stock = 0
        else:
            stock = 1
        price = entry['price_min']
        if price:
            dps.append((entry["url"], ""))
            ret.append((str(entry['url']), str(price), stock))
        else:
            log_with_time("bad response: %r %r" % (task['url'], entry['commodity_id']))

    fret = format_price(ret)
    return {"spider": fret, "dp": dps}
def stock1_parser(task, rule):
    """First-stage stock parser.

    Decodes ``task['text']`` with ``demjson``. When the response has
    ``code == 3`` and a non-empty ``message``, the first run of digits in the
    message is used as a sku id to build a follow-up URL, which is returned
    under ``'stock2'``. Otherwise a formatted price record is returned under
    ``'spider'`` with stock derived from ``totalAmount``.

    Returns ``[]`` when decoding fails or the follow-up URL cannot be built.
    """
    try:
        data = demjson.decode(task['text'])
    except:
        log_with_time("bad response: %r"%task['url'])
        return []

    code = data['code']
    message = data['message']
    ret = {"spider":[], 'stock2':[]}

    follow_url = ""
    if code == 3 and message:
        try:
            skuid = re.search("\d+", message).group()
            follow_url = surl2(task['gid'], skuid)
        except:
            return []

    if follow_url != "":
        ret['stock2'] = [(follow_url, task['gid'], task['price'])]
    else:
        stock = 1 if data.get('totalAmount') else 0
        ret['spider'] = format_price([(itemurl+task['gid'], task['price'], stock)])
    return ret
def book_price(task, rule):
    """Extract ``price[0].proPrice`` from a JSON response.

    Returns the ``format_price`` output for a single
    ``[qid, price, stock]`` record, or ``None`` (after logging) if the
    response cannot be decoded or lacks the expected fields.
    """
    try:
        payload = json.loads(task['text'])
        price = payload['price'][0]['proPrice']
    except:
        log_with_time("bad response: %s" % task['link'])
        return
    else:
        record = [str(task['qid']), str(price), task['stock']]
        return format_price([record])
def price_parser(task, rule):
    """Pull a decimal price following ``price:`` out of ``task['text']``.

    Returns the ``format_price`` output for one ``(gid, price, stock)``
    record, or ``[]`` (after logging) when no price is found.
    """
    try:
        matched = re.search("(?<=price\:)\d+\.\d+(?=\,)", task['text'])
        price = matched.group()
    except:
        log_with_time("bad response: %r"%task['url'])
        return []
    else:
        return format_price([(task['gid'], price, task['stock'])])
def stock_parser(task, rule):
    """Derive stock from the ``havestock`` field of a JSON response.

    Stock is 1 when ``havestock`` is ``"true"`` or ``"realstock"``, else 0.
    ``task['info']`` supplies the item id (index 0) and price (index 1).
    Returns the ``format_price`` output, or ``None`` on a bad response.
    """
    try:
        payload = json.loads(task['text'])
        if payload['havestock'] in ("true", "realstock"):
            stock = 1
        else:
            stock = 0
    except:
        log_with_time("bad response %s"%task['url'])
        return

    info = task['info']
    return format_price([(itemurl % info[0], str(info[1]), stock)])
def stock2_parser(task, rule):
    """Second-stage stock parser: stock is 1 only if ``success":`` is ``true``.

    Returns the ``format_price`` output for one
    ``(itemurl + gid, price, stock)`` record.
    """
    matched = re.search("(?<=success\":).+(?=,)", task['text'])
    stock = 1 if (matched and matched.group() == 'true') else 0
    record = (itemurl+task['gid'], task['price'], stock)
    return format_price([record])
def meizhuang_cats_parser(url, content, rule):
    """Extract (link, price, 1) records from a category page.

    ``rule`` is a sequence of three XPath expressions: ``rule[0]`` selects
    the product nodes, ``rule[1]`` the link and ``rule[2]`` the price,
    both relative to each node.

    Nodes missing a link or a price are logged and skipped. Returns the
    output of ``format_price``.
    """
    tree = etree.HTML(content)
    ret = []
    for node in tree.xpath(rule[0]):
        link = node.xpath(rule[1])
        price = node.xpath(rule[2])
        if not link or not price:
            log_with_time("rule error: %s" % url)
            # Fixed: without this the indexing below raised IndexError.
            continue
        ret.append((link[0], price[0], 1))
    return format_price(ret)
def price_parser(task, rule):
    """Recognise a price from a PNG image and emit one price record.

    ``task["text"]`` is passed to ``_tess.recognize`` and the result is
    cleaned by ``fix_price``. Returns the ``format_price`` output for
    ``(qid, price, stock)``, or ``None`` if recognition fails or no price
    remains after cleaning.
    """
    link = task["link"]
    try:
        raw = _tess.recognize(task["text"], _tess.IMAGE_PNG, 32)
    except:
        log_with_time("invalid image: %s" % link)
        return

    price = fix_price(raw)
    if not price:
        log_with_time("no price: %s" % link)
        return

    log_with_time("%s %s %d" % (link, price, task["stock"]))
    return format_price([(task["qid"], price, task["stock"])])
def checkoffline(task, rule):
    """Report items the response marks as not found.

    Every key in ``items`` whose ``is_found`` is falsy becomes a
    ``(key, "-1", -1)`` record. Returns the ``format_price`` output, or
    ``None`` (after logging) when the response is not usable JSON.
    """
    try:
        items = json.loads(task['text'])['items']
    except:
        log_with_time("bad response %s"%task['url'])
        return

    missing = [
        (str(key), str(-1), -1)
        for key, info in items.items()
        if not info['is_found']
    ]
    return format_price(missing)
def price_parser(task, rule):
    """Recognise a price from an image and return result plus timestamps.

    The recognised text is reduced to its first decimal or integer number.
    Returns ``{"result": ..., "dps": ...}``, or ``None`` (after logging) if
    no number can be extracted.
    """
    recognized = _tess.recognize(task['text'], _tess.IMAGE_PNG, 32)
    try:
        price = re.search("\d+\.\d+|\d+", recognized).group()
    except:
        log_with_time("bad price: %s" % task['url'])
        return

    fret = format_price([(task['gid'], price, task['stock'])])
    dps = {}
    for record in fret:
        dps[record[1]] = int(time.time())
    return {"result":fret, "dps": dps}
def list_parser(task):
    """Extract (link, price, 1) records from product nodes of a listing page.

    Nodes are selected with ``task["rule"]["rule"]``; the price text is
    cleaned by ``fix_price``. Nodes lacking a link or a price are logged and
    skipped. Returns the ``format_price`` output.
    """
    tree = etree.HTML(task["recv"].getvalue())
    records = []
    for node in tree.xpath(task["rule"]["rule"]):
        link = node.xpath("div/div[@class = 'proTit']/a/@href")
        price = node.xpath("div/div[@class = 'proPrice']/text()")
        if link and price:
            records.append((link[0], fix_price(price[0]), 1))
        else:
            log_with_time("rule error: %s" % task["old_url"])
    return format_price(records)
def stock_parser(task, rule):
    """Read a numeric stock count from the node selected by ``rule``.

    Stock is 1 when the first matched node's text converts to a non-zero
    integer, else 0. Returns the ``format_price`` output for one
    ``(url, price, stock)`` record, or ``[]`` (after logging) if nothing
    matches.
    """
    tree = etree.HTML(task["text"])
    matches = tree.xpath(rule)
    if not matches:
        log_with_time("bad response: %s" % task["url"])
        return []

    # NOTE(review): int() raises if the node text is empty or non-numeric.
    stock = 1 if int(matches[0].text) else 0
    return format_price([(task['url'], task['price'], stock)])
def stock_parser(task, rule):
    """Detect a sold-out page and return one price record with timestamps.

    The page is parsed and re-serialised; if the serialised markup contains
    the sold-out marker the stock is 0, otherwise 1.

    Returns ``{"result": ..., "dps": ...}`` where ``dps`` maps the second
    field of each formatted record to the current Unix time.
    """
    t = etree.HTML(task['text'])
    # Fixed: etree.tostring() defaults to ASCII output, which writes CJK
    # characters as numeric character references, so the marker could never
    # match. Serialise as UTF-8 and compare as text instead.
    s = etree.tostring(t, encoding="utf-8").decode("utf-8")
    stock = 0 if u'售罄' in s else 1
    ret = [(task['url'], task['price'], stock)]
    fret = format_price(ret)
    dps = {}
    for i in fret:
        dps[i[1]] = int(time.time())
    return {"result": fret, "dps": dps}
def stock_parser(task, rule):
    """Return one price record whose stock reflects a sold-out marker.

    Stock is 0 when the re-serialised page contains the sold-out marker,
    else 1. The return value is ``{"result": ..., "dps": ...}``; ``dps``
    maps the second field of each formatted record to the current Unix time.
    """
    tree = etree.HTML(task["text"])
    # Fixed: the default ASCII serialisation escapes non-ASCII characters as
    # "&#NNNN;" references, so the original substring test always failed.
    # Serialise as UTF-8 and compare as text.
    markup = etree.tostring(tree, encoding="utf-8").decode("utf-8")
    stock = 0 if u"售罄" in markup else 1
    fret = format_price([(task["url"], task["price"], stock)])
    dps = {}
    for record in fret:
        dps[record[1]] = int(time.time())
    return {"result": fret, "dps": dps}
def list_parser(task, rule):
    """Extract (link, price, 1) records from a listing page.

    ``rule`` is a mapping with XPath expressions under ``"node"`` (product
    nodes), ``"link"`` and ``"price"`` (relative to each node). Prices are
    cleaned by ``fix_price``. Nodes lacking a link or a price are logged and
    skipped. Returns the ``format_price`` output.
    """
    # The unused function-level ``import pdb`` (debug leftover) was removed.
    t = etree.HTML(task["text"])
    nodes = t.xpath(rule["node"])
    ret = []
    for node in nodes:
        link = node.xpath(rule["link"])
        price = node.xpath(rule["price"])
        if not link or not price:
            log_with_time("rule error: %s" % task["old_url"])
            continue
        p = fix_price(price[0])
        ret.append((str(link[0]), str(p), 1))
    return format_price(ret)
def stock_parser(task):
    """Convert a JSON stock response into (id, price, stock) records.

    The response is read from ``task["recv"]``. A falsy ``res`` field is
    logged and yields ``None``. Each entry of ``data`` contributes its
    ``id``, its ``sp`` price and a 0/1 stock flag derived from ``s``.
    """
    payload = json.loads(task["recv"].getvalue())
    if not payload["res"]:
        log_with_time("bad response: %s" % task["url"])
        return

    records = []
    for entry in payload["data"]:
        stock = 1 if int(entry["s"]) else 0
        price = entry["sp"]
        records.append((entry["id"], price, stock))
    return format_price(records)
def stock_parser(task, rule):
    """Derive stock from the summed ``inventory`` values of a JSON response.

    Returns ``{"result": ..., "dps": ...}`` for one
    ``(gurl, price, stock)`` record, or ``None`` (after logging) if the
    response is not valid JSON.
    """
    try:
        payload = json.loads(task['text'])
    except:
        log_with_time("bad response %s"%task['url'])
        return

    inventory = payload.get('inventory')
    total = sum(inventory.values()) if inventory else 0
    # A non-zero total collapses to 1; a zero total is passed through as is.
    stock = 1 if total else total

    fret = format_price([(task['gurl'], task['price'], stock)])
    dps = {}
    for record in fret:
        dps[record[1]] = int(time.time())
    return {"result":fret, "dps":dps}
def stock_parser(task, rule):
    """Convert a UTF-8 JSONP response's ``PromPriceList`` into price records.

    Each entry yields ``(ProductId, PromPriceShow, stock)`` with stock 1 when
    ``Stock`` is truthy. Returns the ``format_price`` output, or ``None``
    (after logging) if the response cannot be decoded.
    """
    try:
        payload = jsonp_json(task["text"].decode("utf-8"))
    except Exception as e:
        log_with_time("response error: %s %s" % (task["url"], e))
        return

    records = []
    for item in payload.get("PromPriceList", []):
        pid = str(item["ProductId"])
        price = item["PromPriceShow"]
        stock = 1 if item["Stock"] else 0
        records.append((pid, str(price), stock))
    return format_price(records)
def stock_parser(task, rule):
    """Convert the ``result`` list of a GBK JSONP response into price records.

    The price is read from ``price.buyPrice.priceValue`` and falls back to
    ``-1`` when that path is unavailable; stock is 1 when ``sellable`` is
    truthy. Returns the ``format_price`` output, or ``None`` (after logging)
    on a bad response.
    """
    try:
        entries = jsonp_json(task["text"].decode("gbk"))["result"]
    except:
        log_with_time("bad response %s"%task['url'])
        return

    records = []
    for entry in entries:
        gid = entry['id']
        try:
            price = entry['price']['buyPrice']['priceValue']
        except:
            price = -1
        if entry['sellable']:
            stock = 1
        else:
            stock = 0
        records.append((str(itemurl%gid), str(price), stock))
    return format_price(records)
def stock_parser(task, rule):
    """Build price records from the ``result`` entries of a GBK JSONP reply.

    For each entry the item URL is built from ``id``, the price comes from
    ``price.buyPrice.priceValue`` (``-1`` if that lookup fails) and stock is
    1 when ``sellable`` is truthy. Returns the ``format_price`` output, or
    ``None`` after logging when the reply cannot be decoded.
    """
    try:
        decoded = jsonp_json(task["text"].decode("gbk"))
        nodes = decoded["result"]
    except:
        log_with_time("bad response %s" % task['url'])
        return

    ret = []
    for node in nodes:
        gid = node['id']
        try:
            price = node['price']['buyPrice']['priceValue']
        except:
            # No usable price for this entry; -1 marks it as unknown.
            price = -1
        stock = 1 if node['sellable'] else 0
        ret.append((str(itemurl % gid), str(price), stock))

    return format_price(ret)
def stock_parser(task, rule):
    """Compute a stock flag from the ``inventory`` mapping of a JSON reply.

    The inventory values are summed; any non-zero total becomes stock 1.
    Returns ``{"result": ..., "dps": ...}``, or ``None`` after logging when
    the reply is not valid JSON.
    """
    try:
        reply = json.loads(task["text"])
    except:
        log_with_time("bad response %s" % task["url"])
        return

    inv = reply.get("inventory")
    if inv:
        stock = sum(inv.values())
    else:
        stock = 0
    if stock:
        stock = 1

    fret = format_price([(task["gurl"], task["price"], stock)])
    dps = {}
    for row in fret:
        dps[row[1]] = int(time.time())
    return {"result": fret, "dps": dps}
def price_parser(task, rule):
    """Build price records from the ``data`` mapping of a JSON response.

    Each key ``k`` of ``data`` yields ``(itemurl % k, np, task['stock'][k])``.

    Returns ``{"result": ..., "dps": ...}`` on success. If the response is
    not valid JSON or lacks the expected structure, the traceback is printed
    and ``[]`` is returned.
    """
    ret = []
    try:
        j = json.loads(task['text'])
        stocks = task['stock']
        for k, v in j['data'].items():
            ret.append((str(itemurl % k), str(v['np']), stocks[k]))
    except Exception:
        # Fixed: the original dropped into pdb here, which blocks an
        # unattended worker forever. Report the error and keep going.
        import traceback
        traceback.print_exc()
        return []
    fret = format_price(ret)
    dps = {}
    for i in fret:
        dps[i[1]] = int(time.time())
    return {"result": fret, "dps": dps}
def price_parser(task, rule):
    """Build price records from a JSON mapping of item id to item info.

    Each entry yields ``[itemurl % id, price, stock]`` with stock 1 when
    ``cart_class`` is truthy. Returns ``{"result": ..., "dps": ...}``, or
    ``None`` (after logging) for invalid JSON or an empty response.
    """
    try:
        payload = json.loads(task['text'])
    except:
        log_with_time("bad response: %s"%task['url'])
        return
    if not payload:
        log_with_time("bad request: %s"% task["payload"])
        return

    records = []
    for item_id, info in payload.items():
        stock = 1 if info['cart_class'] else 0
        records.append([str(itemurl%item_id), str(info['price']), stock])

    fret = format_price(records)
    dps = {}
    for record in fret:
        dps[record[1]] = int(time.time())
    return {"result": fret, "dps": dps}
def price_parser(task, rule):
    """Turn the ``data`` mapping of a JSON response into price records.

    For every key ``k`` in ``data`` a record
    ``(itemurl % k, np, task['stock'][k])`` is produced.

    Returns ``{"result": ..., "dps": ...}``; on invalid JSON or an
    unexpected structure the traceback is printed and ``[]`` is returned.
    """
    records = []
    try:
        payload = json.loads(task['text'])
        stocks = task['stock']
        for key, info in payload['data'].items():
            records.append((str(itemurl % key), str(info['np']), stocks[key]))
    except Exception:
        # Fixed: pdb.set_trace() here would stall the process waiting on a
        # debugger prompt; print the traceback instead.
        import traceback
        traceback.print_exc()
        return []
    fret = format_price(records)
    dps = {}
    for row in fret:
        dps[row[1]] = int(time.time())
    return {"result": fret, "dps": dps}
def stock_parser(task, rule):
    """Flag an item as out of stock when the page contains the marker text.

    Returns ``{"result": ..., "dps": ...}`` for one
    ``(url, price, stock)`` record.
    """
    out_of_stock = re.search("暂时无货", task['text'])
    gid = task["url"]
    price = task["price"]
    stock = 0 if out_of_stock else 1

    fret = format_price([(gid, price, stock)])
    dps = {}
    for record in fret:
        dps[record[1]] = int(time.time())
    return {"result":fret, "dps":dps}
def list_parser(task, rule):
    """Extract prices, comment counts and promo ids from a listing page.

    ``rule`` supplies XPath expressions under ``nodes``, ``gid``, ``price``,
    ``stock`` and ``comment``. Nodes missing a gid or price are logged and
    skipped; a missing comment node is logged and counted as ``'0'``.

    Returns a dict with ``result``, ``dps``, ``comment``, ``dp`` and
    ``promos``, or ``None`` (after logging) if the page cannot be parsed or
    no nodes match.
    """
    try:
        tree = etree.HTML(task['text'])
        nodes = tree.xpath(rule["nodes"])
    except:
        log_with_time("bad response %s" % task['url'])
        return
    if not nodes:
        log_with_time("bad rule %s" % task['url'])
        return

    ret = []
    comments = {}
    promos = []
    for node in nodes:
        gid_hits = node.xpath(rule["gid"])
        price_hits = node.xpath(rule["price"])
        stock = 1 if node.xpath(rule["stock"]) else 0
        if not gid_hits or not price_hits:
            log_with_time("bad rule %s" % task['url'])
            continue

        gid = gid_hits[0]
        ret.append((gid, price_hits[0].text, stock))

        comment = node.xpath(rule['comment'])
        if not comment:
            log_with_time("bad rule for comments: %s" % task['url'])
            comment = ['0']
        # Comments and promos are keyed by the id extracted from the gid.
        short_gid = re_gid.search(gid).group()
        comments[short_gid] = re.search("\d+", ','.join(comment)).group()
        promos.append(short_gid)

    fret = format_price(ret)
    dp = [(record[0], "") for record in ret]
    dps = {}
    for record in fret:
        dps[record[1]] = int(time.time())

    return {
        "result": fret,
        "dps": dps,
        "comment": comments,
        "dp": dp,
        "promos": promos
    }
def list_parser(task, rule):
    """Parse a JSONP product listing into prices, titles, shops and comments.

    Each product's ``skus`` entry supplies price, URL, name, stock and sku
    number. Products missing price, URL or name are logged and skipped.

    Returns a dict with ``spider``, ``dp``, ``dps_log``, ``shop``,
    ``comment`` and ``promo``, or ``None`` (after logging) if the response
    has no ``products`` key.
    """
    item = jsonp_json(task["text"])
    if "products" not in item:
        log_with_time("found nothing: %s" % task["url"])
        return

    now = int(time.time())
    skus = []
    dp_pairs = []
    promos = []
    shop = {}
    comment = {}

    for product in item["products"]:
        try:
            sku = product["skus"]
            price = str(sku["price"])
            url = str(sku["sUrl"])
            title = sku["name"]
        except KeyError:
            log_with_time("rule error: %s" % task["text"])
            continue

        dp_pairs.append((url, title))
        stock = 1 if sku["stock"] > 0 else 0
        promos.append((url, sku["skuNo"]))
        skus.append((url, price, stock))

        # Shop and comment data are keyed by the CRC of the item URL.
        if product.get("shopId"):
            shop[get_crc(url)] = "%s,%s" % (product["shopId"], product["sName"])
        if product.get("evaluateCount"):
            comment[get_crc(url)] = int(product["evaluateCount"])

    result = format_price(skus)
    dps_log = {}
    for record in result:
        dps_log[record[1]] = now

    return {
        "spider": result,
        "dp": dp_pairs,
        "dps_log": dps_log,
        "shop": shop,
        "comment": comment,
        "promo": promos,
    }
def stock_parser(task, rule):
    """Convert a JSON stock response into (id, price, stock) records.

    Anything before the first ``{`` in ``task["text"]`` is discarded before
    decoding. A falsy ``res`` field is logged and yields ``None``.
    Otherwise returns the ``format_price`` output.
    """
    raw = task["text"]
    if not raw.startswith("{"):
        raw = raw[raw.find("{"):]
    payload = json.loads(raw)
    if not payload["res"]:
        log_with_time("bad response: %s" % task["url"])
        return

    records = []
    for entry in payload["data"]:
        stock = 1 if int(entry["s"]) else 0
        price = entry["sp"]
        qid = entry["id"]
        records.append((str(qid), str(price), stock))
    return format_price(records)
def stock_parser(task, rule):
    """Pair each product's local stock flag with its price from the task.

    Prices come from ``task['price']`` keyed by the stringified
    ``productId``. Returns the ``format_price`` output, or ``None`` (after
    logging) for invalid JSON or an empty response.
    """
    try:
        entries = json.loads(task["text"])
    except ValueError:
        log_with_time("json error: %s, %s" % (task["url"],task["text"]))
        return
    if not entries:
        log_with_time("bad response: %s" % task["url"])
        return

    records = []
    for entry in entries:
        stock = 1 if int(entry["localStock"]) else 0
        qid = str(entry["productId"])
        records.append((qid, task['price'][qid], stock))
    return format_price(records)
def stock_parser(task, rule):
    """Build (productId, price, stock) records from a JSON stock list.

    ``localStock`` is converted with ``int`` and collapsed to a 0/1 flag;
    the price is looked up in ``task['price']`` by product id. Returns the
    ``format_price`` output, or ``None`` after logging when the JSON is
    invalid or empty.
    """
    try:
        stock_list = json.loads(task["text"])
    except ValueError:
        log_with_time("json error: %s, %s" % (task["url"], task["text"]))
        return

    if not stock_list:
        log_with_time("bad response: %s" % task["url"])
        return

    ret = []
    for product in stock_list:
        if int(product["localStock"]):
            in_stock = 1
        else:
            in_stock = 0
        product_id = str(product["productId"])
        ret.append((product_id, task['price'][product_id], in_stock))

    return format_price(ret)
def list_parser(task, rule):
    """Extract (item URL, price, 1) records from a GBK listing page.

    ``rule`` supplies XPath expressions under ``nodes``, ``gid`` and
    ``price``. The price is taken from the text of the first price node and
    narrowed with ``re_price``. Nodes lacking a gid or price are logged and
    skipped. Returns ``{"result": ..., "dps": ...}``.
    """
    tree = etree.HTML(task['text'].decode('gbk', 'replace'))
    records = []
    for node in tree.xpath(rule['nodes']):
        gid_hits = node.xpath(rule['gid'])
        price_hits = node.xpath(rule['price'])
        if not gid_hits or not price_hits:
            log_with_time("bad rules: %r" % task['url'])
            continue
        gid = gid_hits[0]
        # NOTE(review): raises if the node text contains no price match.
        price = re_price.search(price_hits[0].text).group()
        records.append((str(itemurl+gid), str(price), 1))

    fret = format_price(records)
    dps = {}
    for record in fret:
        dps[record[1]] = int(time.time())
    return {"result":fret, "dps":dps}
def stock_parser(task, rule):
    """Derive stock from ``glist[0].stockstatus`` of a JSON response.

    Stock is 0 when the status is empty or its UTF-8 encoding contains the
    "none" marker, else 1. The item code is the digits following
    ``gcodes=`` in ``task['url']``. Returns ``{"result": ..., "dps": ...}``,
    or ``[]`` (after logging) on a bad response.
    """
    try:
        payload = json.loads(task['text'])
        status = payload['glist'][0]['stockstatus']
    except:
        log_with_time("bad response %r"%task['url'])
        return []

    available = status and '无' not in status.encode("utf8")
    stock = 1 if available else 0

    code = re.search("(?<=gcodes=)\d+", task['url']).group()
    fret = format_price([(itemburl%code, task['price'], stock)])
    dps = {}
    for record in fret:
        dps[record[1]] = int(time.time())
    return {"result": fret, "dps": dps}
def list_parser(task, rule):
    """Extract price records from nodes carrying ``goodsid``/``price`` attributes.

    ``rule['nodes']`` selects the product nodes; a non-empty match for
    ``rule['ostock']`` inside a node marks it as out of stock. The first
    character of the ``price`` attribute is dropped (presumably a currency
    sign).

    Nodes without a usable ``goodsid`` or ``price`` are logged and skipped.
    Returns ``{"result": ..., "dps": ...}`` where ``dps`` maps each gid to
    the current Unix time.
    """
    t = etree.HTML(task['text'])
    nodes = t.xpath(rule['nodes'])
    ret = []
    dps = {}
    for node in nodes:
        # Fixed: attrib[...] raised KeyError on a missing attribute, so the
        # guard below never got the chance to skip the node.
        gid = node.attrib.get("goodsid")
        price = node.attrib.get('price', '')[1:]
        ostock = node.xpath(rule['ostock'])
        if not gid or not price:
            log_with_time("bad response %r" % task['url'])
            continue
        stock = 0 if ostock else 1
        ret.append((str(gid_html % gid), str(price), stock))
        dps[gid] = int(time.time())
    fret = format_price(ret)
    return {"result": fret, "dps": dps}
def list_parser(task, rule):
    """Build price records from product nodes' ``goodsid`` and ``price`` attributes.

    Nodes come from ``rule['nodes']``. A node is out of stock when
    ``rule['ostock']`` matches inside it. The leading character of the
    ``price`` attribute is stripped (presumably a currency sign).

    Nodes lacking either attribute are logged and skipped. Returns
    ``{"result": ..., "dps": ...}``; ``dps`` maps each gid to the current
    Unix time.
    """
    tree = etree.HTML(task['text'])
    ret = []
    dps = {}
    for node in tree.xpath(rule['nodes']):
        # Fixed: use .get() so a missing attribute reaches the guard below
        # instead of raising KeyError and aborting the whole page.
        gid = node.attrib.get("goodsid")
        price = node.attrib.get('price', '')[1:]
        ostock = node.xpath(rule['ostock'])
        if not gid or not price:
            log_with_time("bad response %r" % task['url'])
            continue
        stock = 0 if ostock else 1
        ret.append((str(gid_html % gid), str(price), stock))
        dps[gid] = int(time.time())
    return {"result": format_price(ret), "dps": dps}
def stock_parser(task, rule):
    """Map the first ``glist`` entry's ``stockstatus`` to a 0/1 stock flag.

    An empty status, or one whose UTF-8 bytes contain the "none" marker,
    gives stock 0. The item URL is built from the digits after ``gcodes=``
    in ``task['url']``. Returns ``{"result": ..., "dps": ...}``, or ``[]``
    after logging when the response is unusable.
    """
    try:
        decoded = json.loads(task['text'])
        stockstatus = decoded['glist'][0]['stockstatus']
    except:
        log_with_time("bad response %r" % task['url'])
        return []

    if stockstatus and '无' not in stockstatus.encode("utf8"):
        stock = 1
    else:
        stock = 0

    code_match = re.search("(?<=gcodes=)\d+", task['url'])
    code = code_match.group()
    fret = format_price([(itemburl % code, task['price'], stock)])

    dps = {}
    for row in fret:
        dps[row[1]] = int(time.time())
    return {"result": fret, "dps": dps}
def stock_parser(task, rule):
    """Join per-item stock states from a GBK JSONP response with known prices.

    An item is in stock when its ``StockStateName`` contains either of the
    two "available" markers. Prices come from ``task["price"]``; price keys
    without a stock entry are logged and skipped. Returns the
    ``format_price`` output, or ``None`` (after logging) when decoding
    raises ``ValueError``.
    """
    try:
        states = jsonp_json(task["text"].decode("gbk"))
    except ValueError:
        log_with_time("stock_parser: jsonp_json: %s" % task["text"])
        return

    stocks = {}
    for key, info in states.items():
        name = info["StockStateName"]
        stocks[key] = 1 if (u"现货" in name or u"有货" in name) else 0

    records = []
    for key, price in task["price"].items():
        if key in stocks:
            records.append((key, price, stocks[key]))
        else:
            log_with_time("key not in stocks: %s" % task["url"])
    return format_price(records)
def list_parser(task, rule):
    """Extract (link, price, 1) records from a GB18030-encoded listing page.

    ``rule`` supplies XPath expressions under ``node``, ``link`` and
    ``price``. The first character of the price text is dropped (presumably
    a currency sign). Every record is reported with stock 1.

    Nodes lacking a link or price are logged and skipped. Returns the
    ``format_price`` output, or ``None`` (after printing the traceback) if
    the page cannot be decoded or parsed.
    """
    try:
        t = etree.HTML(task["text"].decode('gb18030'))
    except Exception:
        traceback.print_exc()
        return
    nodes = t.xpath(rule["node"])
    ret = []
    for node in nodes:
        link = node.xpath(rule["link"])
        price = node.xpath(rule["price"])
        if not link or not price:
            log_with_time("rule error: %s" % task["url"])
            continue
        # The per-record debug print to stdout was removed.
        ret.append((str(link[0]), str(price[0][1:]), 1))
    return format_price(ret)
def stock_parser(task, rule):
    """Combine stock states from a GBK JSONP reply with the task's prices.

    ``StockStateName`` values containing either "available" marker map to
    stock 1, all others to 0. Each ``task["price"]`` entry with a known
    stock state becomes a ``(key, price, stock)`` record; unknown keys are
    logged and skipped. Returns the ``format_price`` output, or ``None``
    after logging when ``jsonp_json`` raises ``ValueError``.
    """
    try:
        reply = jsonp_json(task["text"].decode("gbk"))
    except ValueError:
        log_with_time("stock_parser: jsonp_json: %s" % task["text"])
        return

    stocks = {}
    for item_key, item in reply.items():
        state_name = item["StockStateName"]
        if u"现货" in state_name or u"有货" in state_name:
            stocks[item_key] = 1
        else:
            stocks[item_key] = 0

    ret = []
    for item_key, item_price in task["price"].items():
        if item_key not in stocks:
            log_with_time("key not in stocks: %s" % task["url"])
            continue
        ret.append((item_key, item_price, stocks[item_key]))

    return format_price(ret)
def list_parser(task, rule):
    """Build (link, price, 1) records from a GB18030 listing page.

    XPath expressions are taken from ``rule["node"]``, ``rule["link"]`` and
    ``rule["price"]``. The leading character of the price text is stripped
    (presumably a currency sign) and stock is always reported as 1.

    Nodes missing a link or a price are logged and skipped. Returns the
    ``format_price`` output, or ``None`` (after printing the traceback) when
    the page cannot be decoded or parsed.
    """
    try:
        tree = etree.HTML(task["text"].decode('gb18030'))
    except Exception:
        traceback.print_exc()
        return

    ret = []
    for node in tree.xpath(rule["node"]):
        link = node.xpath(rule["link"])
        price = node.xpath(rule["price"])
        if not link or not price:
            log_with_time("rule error: %s" % task["url"])
            continue
        # The debug print that echoed every record to stdout was removed.
        ret.append((str(link[0]), str(price[0][1:]), 1))
    return format_price(ret)
def list_parser(task, rule):
    """Extract (link, price, stock) records from a listing page.

    ``rule`` supplies XPath expressions under ``node``, ``link``, ``price``
    and ``stock``. Stock is 1 when the string form of the first ``stock``
    match contains ``"none"``, else 0. The first character of the price text
    is dropped.

    Nodes lacking a link or price are logged and skipped. Returns the
    ``format_price`` output, or ``None`` (after printing the traceback) if
    the page cannot be parsed.
    """
    try:
        tree = etree.HTML(task["text"])
    except:
        traceback.print_exc()
        return

    records = []
    for node in tree.xpath(rule["node"]):
        link = node.xpath(rule["link"])
        price = node.xpath(rule["price"])
        # NOTE(review): raises IndexError when the stock rule matches nothing.
        stock_marker = str(node.xpath(rule["stock"])[0])
        stock = "none" in stock_marker
        if link and price:
            records.append((str(link[0]), str(price[0][1:]), int(stock)))
        else:
            log_with_time("rule error: %s" % task["url"])
    return format_price(records)