def list_parser(task):
    """Parse a category listing page and return format_price()-ed rows.

    Items come from two places: inline JSON blobs (dynamically rendered
    items carrying sell_price) and the regular xpath-selected nodes.
    """
    rule = task["rule"]
    content = task["recv"].getvalue()
    rows = []
    # Dynamically rendered items embed their price in an inline JSON object.
    for blob in re.findall('({.*?sell_price.*?}),', content):
        try:
            item = json.loads(blob)
        except ValueError:
            continue
        rows.append((item_base % item["id"], item["sell_price"], 1))
    tree = etree.HTML(content)
    for node in tree.xpath(rule["rule"]):
        link = node.xpath("div[@class = 'cat-item-pic']/a/@href")
        price = node.xpath(
            "figcaption[@class = 'cat-item-inf']/p/span[@class = 'cat-pire-nub']/text()"
        )
        if not link or not price:
            log_with_time("rule error: %s" % task["url"])
            continue
        rows.append((link[0], price[0], 1))
    return format_price(rows)
def list_parser(task, rule):
    """Extract product links and gids from a listing page.

    Returns a dict with (link, "") pairs under "dp", the gid list under
    "price" and a gid -> timestamp map under "dps_log"; None when the
    node rule matches nothing.
    """
    tree = etree.HTML(task["text"])
    nodes = tree.xpath(rule["node"])
    if not nodes:
        log_with_time("node rule error: %s" % task["url"])
        return
    now = int(time.time())
    dp = []
    dps = {}
    gids = []
    for node in nodes:
        link = node.xpath(rule["link"])
        gid = node.xpath(rule["gid"])
        if not link or not gid:
            log_with_time("rule error: %s" % task["url"])
            continue
        gid = gid[0]
        dp.append((link[0], ""))
        gids.append(gid)
        dps[gid] = now
    return {
        "dps_log": dps,
        "dp": dp,
        "price": gids,
    }
def stock1_parser(task, rule):
    """Decode a first-stage stock response.

    Returns {"spider": [...], "stock2": [...]}: when the response redirects
    to a skuid (code == 3), a stock2 follow-up task is emitted; otherwise
    the stock flag is derived from totalAmount. [] on a bad response.
    """
    try:
        j = demjson.decode(task['text'])
    except Exception:  # was a bare except: don't swallow SystemExit etc.
        log_with_time("bad response: %r" % task['url'])
        return []
    code = j['code']
    message = j['message']
    url = ""
    ret = {"spider": [], 'stock2': []}
    if code == 3 and message:
        # The message embeds the real skuid when the gid was redirected.
        try:
            skuid = re.search("\d+", message).group()
            url = surl2(task['gid'], skuid)
        except Exception:  # was a bare except
            return []
    if url == "":
        stock = 1 if j.get('totalAmount') else 0
        ret['spider'] = format_price([(itemurl + task['gid'], task['price'], stock)])
    else:
        ret['stock2'] = [(url, task['gid'], task['price'])]
    return ret
def pager(task):
    """Build paginated search-result URLs from the total item count.

    Returns a list of task dicts, or None on a rule miss or an empty
    ("found 0 items") search result.
    """
    rule = task["rule"]
    content = task["recv"].getvalue()
    tree = etree.HTML(content)
    total = tree.xpath(rule["rule"])
    if "找到0个商品" in content:
        log_with_time("search result 0: %s" % task["url"])
        return
    if not total:
        log_with_time("rule error: %s" % rule["rule"])
        return
    total = int(total[0])
    page = rule["page"]
    # // keeps this an int on both Python 2 and 3; plain "/" yields a
    # float under Python 3 and breaks range(num) below.
    num = total // page
    if total % page:
        num += 1
    tasks = []
    for i in range(num):
        tasks.append({
            "url": task["url"] + "&order=normal&sort=desc&per_page=%s" % (i * page),
            "old_url": task["url"],
        })
    return tasks
def rt_parser(items):
    """Fetch real-time price and stock for the given items.

    Returns format_price()-ed (pid, price, stock) rows; None on any
    fetch/parse failure.
    """
    pids = get_pids(items)
    if not pids:
        # fix: original logged the undefined name "entries" (NameError)
        log_with_time("got nothing: %s" % items)
        return
    purl = price_url % (",".join(["J_" + i for i in pids]),
                        random.randint(1000000, 10000000),
                        int(time.time() * 1000))
    surl = stock_url % (async_http.quote(",".join(pids)),
                        random.randint(1000000, 10000000),
                        int(time.time() * 1000))
    price_res = simple_http.get(purl)
    stock_res = simple_http.get(surl)
    if price_res["status"] != 200 or stock_res["status"] != 200:
        # fix: original formatted the undefined name "price" (NameError)
        log_with_time("not200: %s %s" % (price_res["status"], stock_res["status"]))
        return
    try:
        price_json = jsonp_json(price_res["text"])
        stock_json = jsonp_json(stock_res["text"].decode("gbk"))
    except Exception:  # was a bare except
        traceback.print_exc()
        return
    prices = {}
    for i in price_json:
        prices[i["id"].split("_")[1]] = i["p"]
    stocks = {}
    for k, v in stock_json.items():
        s = v["StockStateName"]
        # "有货" / "现货" state names count as in stock
        stocks[k] = 1 if (u"有货" in s or u"现货" in s) else 0
    ret = []
    for pid in prices:
        ret.append((str(pid), str(prices[pid]), stocks[pid]))
    return format_price(ret)
def cats_parser(url, res, rule):
    """Collect the set of category URLs (ctg and list variants) linked
    from a category page, skipping the /c0-0/ catch-all."""
    tree = etree.HTML(res['text'])
    found = set()
    for href in tree.xpath(rule):
        if '/c0-0/' in href:
            continue
        if '/ctg/s2/' in href:
            pattern = "(?<=/ctg/s2/).+"
            match = re.search(pattern, href)
            if not match:
                log_with_time("bad regex: %r %r" % (pattern, href))
                continue
            found.add(ctgurl % match.group().split('-')[0])
        elif 'list.yhd.com' in href:
            # e.g. http://list.yhd.com/.../
            pattern = "(?<=yhd\.com\/).+"
            match = re.search(pattern, href)
            if not match:
                log_with_time("bad regex: %r %r" % (pattern, href))
                continue
            found.add(lsturl % match.group().split('-')[0])
    return found
def list_parser(task, rule):
    """Dispatch a GBK list page to the matching sub-parser (ebook list,
    'cp' list, or normal list) and package results, comments, shop and
    dp data; None on an unparseable response."""
    try:
        t = etree.HTML(task['text'].decode('gbk', 'replace'))
    except Exception:  # was a bare except
        log_with_time("bad response %s" % task['url'])
        return
    if ebookurl in task['url']:
        r = __book_list_parser1(t, task, rule)
    elif 'cp' in task['url']:
        r = __book_list_parser2(t, task, rule)
    else:
        r = __norm_list_parser1(t, task, rule)
    ret, comments, shop, dp = r
    fret = format_price(ret)
    # One timestamp for the whole batch instead of one time() call per row.
    now = int(time.time())
    dps = {}
    for row in fret:
        dps[row[1]] = now
    return {
        "result": fret,
        "dps": dps,
        "shop": shop,
        "comment": comments,
        "dp": dp,
    }
def dp_parser(task, rule):
    """Pull the item description URL out of the page's inline JS and pair
    it with the urlcrc of the page URL; None when no desc link exists."""
    found = re.findall("desc: '(http.*?desc/[0-9]+)'", task["text"])
    if not found:
        log_with_time("no desc: %s" % task["url"])
        return
    crc = urlcrc.get_urlcrc(3, task["url"])
    return [(found[0], str(crc), "")]
def pager(task, rule):
    """Expand a category URL into per-page a/b URL pairs.

    The page count is read via `rule`; vc-prefixed categories use the
    virt URL templates, others the part templates. None on failure.
    """
    try:
        tree = etree.HTML(task["text"])
    except Exception:  # was a bare except
        log_with_time("bad response %s" % task['url'])
        return
    page = re.findall("/([0-9]+)", " ".join(tree.xpath(rule)))
    if not page:
        log_with_time("page rule error")
        return
    cat = re.findall("/(v?c[0-9]+-[0-9]+-[0-9]+)", task["url"])[0]
    ret = []
    for i in range(1, int(page[0]) + 1):
        cb = int(time.time() * 1000)  # cache-buster, value not semantic
        if cat.startswith("vc"):
            url_a = virta_base.format(cat=cat, page=i, cb=cb)
            url_b = virtb_base.format(cat=cat, page=i, cb=cb)
        else:
            url_a = parta_base.format(cat=cat, page=i, cb=cb)
            url_b = partb_base.format(cat=cat, page=i, cb=cb)
        ret.append(url_a)
        ret.append(url_b)
    return ret
def list_parser(task, rule):
    """Build one stock-check POST task per listing node, plus a
    gid -> timestamp dps map."""
    tree = etree.HTML(task['text'])
    stock_tasks = []
    dps = {}
    for node in tree.xpath(rule['nodes']):
        gid = node.xpath(rule['gid'])
        price = node.xpath(rule['price'])
        if not gid or not price:
            log_with_time("bad response: %r" % task['url'])
            continue
        gid = re_gid.search(gid[0]).group()
        price = re_price.search(price[0].text).group()
        stock_tasks.append({
            "url": surl1,
            "gid": gid,
            "price": price,
            "payload": {
                "id": gid,
                "type": "0",
                "count": "1"
            }
        })
        dps[gid] = time.time()
    return {"stock": stock_tasks, "dps": dps}
def list_parser(task, rule):
    """Split listing nodes into (gid, stock) price rows and gids that need
    a follow-up item fetch; records a dps timestamp per gid.

    Stock heuristic: a buyinfo node without a buy-cart button is out of
    stock when it shows a sellout marker or the node has no comments.
    """
    tree = etree.HTML(task['text'])
    prices = []
    items = []
    dps = {}
    for node in tree.xpath(rule['nodes']):
        gid = node.attrib['itemid']
        buyinfo = node.xpath(rule['buyinfo'])
        if not gid:
            log_with_time("bad response: %r" % task['url'])
            continue
        if buyinfo:
            buyinfo = buyinfo[0]
            stock = 1
            if not buyinfo.xpath(rule['buycart']):
                if buyinfo.xpath(rule['sellout']) or not node.xpath(rule['comment']):
                    stock = 0
            prices.append((gid, stock))
        else:
            items.append(gid)
        dps[gid] = int(time.time())
    return {"prices": prices, "items": items, "dps": dps}
def extract_book(url, tree, rule):
    """Extract (link, gid, list-id, stock) rows for every book node.

    Returns the rows under "book_price", a gid -> timestamp "dps_log"
    and a gid -> comment map. The list id is the numeric part of the
    URL's last '-' segment.
    """
    result = []
    now = int(time.time())
    dps_log = {}
    comments = {}
    lid = re.search("\d+", url.split('-')[-1]).group()
    for node in tree.xpath(rule["book_node"]):
        link_node = node.xpath(rule["book_title"])
        stock = node.xpath(rule["book_stock"])
        comment = node.xpath(rule["book_comment"])
        if not link_node or not stock:
            log_with_time("rule error: %s" % url)
            continue
        link_node = link_node[0]
        link = link_node.attrib["href"]
        gid = re_gid.search(link).group()
        # fix: comment[0] was read unguarded — an empty comment match
        # raised IndexError and aborted the whole page.
        if comment:
            comments[gid] = comment[0]
        s = 1 if u"有货" in stock[0] else 0
        dps_log[gid] = now
        result.append((link, gid, lid, s))
    return {
        "book_price": result,
        "dps_log": dps_log,
        "comment": comments
    }
def page_parser(task, rule):
    """Turn a first results page into one task per page.

    A single-page result returns just the original URL; None on a bad
    response or rule miss.
    """
    try:
        t = etree.HTML(task["text"])
    except Exception:  # was a bare except
        traceback.print_exc()
        return
    tot = t.xpath(rule[0])
    total = re.findall("/([0-9]+)", "".join(tot))
    if not total:
        log_with_time("rule error: %s" % rule)
        return
    total = int(total[0])
    if total == 1:
        return [{
            "url": task['url'],
        }]
    # rule[1] selects a sample paging link; strip its trailing page number.
    bas = t.xpath(rule[1])[0]
    base = re.findall("(^.*page=?)\d*", bas)[0]
    tasks = []
    for i in range(1, total + 1):
        tasks.append({
            "url": "http://www.likeface.com" + base + str(i),
        })
    return tasks
def list_parser(task, rule):
    """Collect gids from a listing page and return one bulk price task
    plus dp tasks.

    Fixes two bugs: the gid-missing branch fell through without
    `continue` (IndexError on gid[0]), and the second regex failure
    tested the builtin `id` (always truthy) instead of the match object,
    so a failed match raised AttributeError instead of being skipped.
    """
    t = etree.HTML(task['text'])
    nodes = t.xpath(rule['nodes'])
    ret = []
    dp = []
    for node in nodes:
        gid = node.xpath(rule['gid'])
        if not gid:
            log_with_time("bad rules: %r" % task['url'])
            continue
        match = re.search("(?<=item/).+", gid[0])
        if not match:
            match = re.search("(?<=com/).+", gid[0])
        if not match:
            log_with_time("bad regex: %r" % task['url'])
            continue
        gid = match.group()
        dp.append((dp_base % gid, ""))
        ret.append(gid)
    return {
        "price": [
            {
                "url": priceurl,
                "payload": {
                    "itemid": ','.join(ret)
                }
            }],
        "dp": dp
    }
def fix_url(url):
    """Normalize an item URL to `base` plus comma-joined id parts.

    Group-buy ("tuan") URLs and URLs without the numeric id pattern are
    skipped (None returned).
    """
    if "tuan" in url:
        log_with_time("skip url: %s" % url)
        return
    found = re.findall("/([0-9\-]+)\.", url)
    if not found:
        return
    return base + ",".join(found[0].split("-"))
def book_price(task, rule):
    """Extract the promo price from a book-price JSON response.

    Returns one format_price()-ed row; None on a malformed response.
    """
    try:
        j = json.loads(task['text'])
        price = j['price'][0]['proPrice']
    except (ValueError, KeyError, IndexError, TypeError):
        # was a bare except; these are the failures a bad payload produces
        log_with_time("bad response: %s" % task['link'])
        return
    return format_price([[str(task['qid']), str(price), task['stock']]])
def price_parser(task, rule):
    """Scrape the price out of inline JS (price:<n>.<m>,) and return a
    format_price()-ed (gid, price, stock) row; [] when no price found."""
    try:
        price = re.search("(?<=price\:)\d+\.\d+(?=\,)", task['text']).group()
    except Exception:  # was a bare except
        log_with_time("bad response: %r" % task['url'])
        return []
    ret = [(task['gid'], price, task['stock'])]
    return format_price(ret)
def list_parser(task, rule):
    """Map each entry of the response's glist to a (url, price) pair."""
    data = json.loads(task['text'])
    if 'glist' not in data:
        log_with_time("bad response %r" % task['url'])
        return []
    return [(surl % g['gcode'], g['gprice']) for g in data['glist']]
def item_parser(task, rule):
    """Read the buy-button node to decide stock (a 'disabled' attribute
    means out of stock). Returns [(gid, stock)]; None on a bad response."""
    try:
        t = etree.HTML(task['text'])
        btn = t.xpath(rule)[0]
        stock = 0 if btn.attrib.get('disabled') else 1
    except Exception:  # was a bare except
        log_with_time("bad response: %s" % task['url'])
        return
    return [(task['gid'], stock)]
def list_parser(task, rule):
    """Build (url, price) pairs from the glist array of a JSON listing."""
    payload = json.loads(task['text'])
    if 'glist' not in payload:
        log_with_time("bad response %r" % task['url'])
        return []
    pairs = []
    for goods in payload['glist']:
        pairs.append((surl % goods['gcode'], goods['gprice']))
    return pairs
def pager(task, rule):
    """Expand a paged JSON listing into one URL per page for the category
    code embedded in the task URL."""
    data = json.loads(task['text'])
    if 'gpagecount' not in data:
        log_with_time("bad response %r" % task['url'])
        return []
    code = re.search("(?<=code=)\d+(?=&)", task['url']).group()
    return [gurl % (code, i) for i in range(1, data['gpagecount'] + 1)]
def pager(task, rule):
    """Generate per-page URLs from gpagecount, keeping the category code
    parsed out of the original URL."""
    payload = json.loads(task['text'])
    if 'gpagecount' not in payload:
        log_with_time("bad response %r" % task['url'])
        return []
    cat_code = re.search("(?<=code=)\d+(?=&)", task['url']).group()
    pages = []
    for page_no in range(1, payload['gpagecount'] + 1):
        pages.append(gurl % (cat_code, page_no))
    return pages
def price_parser(task, rule):
    """Decode a JSONP price response into a single {pid: price} dict
    wrapped in a list; None when the payload is not valid JSONP."""
    try:
        items = jsonp_json(task["text"])
    except ValueError:
        log_with_time("price_parser: jsonp_json: %s" % task["text"])
        return
    prices = {}
    for entry in items:
        pid = entry["id"].split("_")[1]
        prices[pid] = entry["p"]
    return [prices]
def stock_parser(task, rule):
    """Translate the JSON havestock flag into 0/1 and return one
    format_price()-ed row; None on a malformed response."""
    try:
        j = json.loads(task['text'])
        stock = 1 if j['havestock'] in ("true", "realstock") else 0
    except Exception:  # was a bare except
        log_with_time("bad response %s" % task['url'])
        return
    ret = [(itemurl % task['info'][0], str(task['info'][1]), stock)]
    return format_price(ret)
def cats(url, res, rule):
    """Return category URLs: the yougou base joined with each xpath match;
    None when the response fails to parse."""
    content = res["text"]
    try:
        t = etree.HTML(content)
    except Exception:  # was a bare except
        log_with_time("bad response %s" % content.decode("utf-8", "replace"))
        return
    return [yougou + i for i in t.xpath(rule)]
def cats(url, res, rule):
    """Prefix every category path matched by `rule` with the yougou base
    URL; None when the response is not parseable HTML."""
    content = res['text']
    try:
        t = etree.HTML(content)
    except Exception:  # was a bare except
        log_with_time("bad response %s" % content.decode('utf-8', 'replace'))
        return
    ret = []
    for path in t.xpath(rule):
        ret.append(yougou + path)
    return ret
def pager(task, rule):
    """Expand a category page into page URLs 2..N, keeping the original
    URL first; None when the page rule matches nothing."""
    tree = etree.HTML(task["text"])
    matched = tree.xpath(rule)
    if not matched:
        log_with_time("page rule error")
        return
    urls = [task["url"]]
    nums = re.findall("\d+", " ".join(matched))
    cat = re.findall("cateID=(\d+)", task["url"])[0]
    for page in range(2, int(nums[0]) + 1):
        urls.append(page_base.format(cat=cat, page=page))
    return urls
def checkoffline(task, rule):
    """Emit (-1, -1) rows for items the service reports as not found and
    return them format_price()-ed; None on a malformed response."""
    try:
        j = json.loads(task['text'])
        j = j['items']
    except Exception:  # was a bare except
        log_with_time("bad response %s" % task['url'])
        return
    ret = []
    for key, info in j.items():
        if not info['is_found']:
            ret.append((str(key), str(-1), -1))
    return format_price(ret)
def meizhuang_cats_parser(url, content, rule):
    """Parse (link, price, 1) rows from a cosmetics category page and
    return them format_price()-ed."""
    t = etree.HTML(content)
    ret = []
    for node in t.xpath(rule[0]):
        # link
        link = node.xpath(rule[1])
        # price
        price = node.xpath(rule[2])
        if not link or not price:
            log_with_time("rule error: %s" % url)
            # fix: was missing `continue`, so link[0]/price[0] below
            # raised IndexError on every rule miss
            continue
        ret.append((link[0], price[0], 1))
    return format_price(ret)
def price_parser(task, rule):
    """OCR the price image, pull the numeric value, and return the
    format_price()-ed row plus a gid -> timestamp dps map; None when no
    price can be recognized."""
    price = _tess.recognize(task['text'], _tess.IMAGE_PNG, 32)
    try:
        price = re.search("\d+\.\d+|\d+", price).group()
    except Exception:  # was a bare except
        log_with_time("bad price: %s" % task['url'])
        return
    fret = format_price([(task['gid'], price, task['stock'])])
    now = int(time.time())
    dps = {}
    for row in fret:
        dps[row[1]] = now
    return {"result": fret, "dps": dps}
def list_parser(task, rule):
    """Collect (gid, price) pairs from listing nodes; the gid is the
    numeric id-<n> fragment of the link."""
    tree = etree.HTML(task['text'])
    pairs = []
    for node in tree.xpath(rule['nodes']):
        gid = node.xpath(rule['gid'])
        price = node.xpath(rule['price'])
        if not gid or not price:
            log_with_time("bad response: %r" % task['url'])
            continue
        ids = re.findall("id-([0-9]+)", gid[0])
        pairs.append((ids[0], price[0]))
    return pairs
def list_parser(task, rule):
    """Build (absolute-link, qid, price) triples from listing nodes; the
    qid is the second '_'-segment of the node id attribute."""
    tree = etree.HTML(task["text"])
    rows = []
    for node in tree.xpath(rule["node"]):
        qid = node.attrib["id"].split("_")
        link = node.xpath(rule["link"])
        price = node.xpath(rule["price"])
        if not link or not price or not qid:
            log_with_time("rule error: %s" % task["url"])
            continue
        rows.append(("http://www.miyabaobei.com" + link[0], qid[1], price[0]))
    return rows
def pager(task, rule):
    """Use the pageBar of a JSONP response to emit one payload task per
    page; [] for an empty category, None when pageBar is absent."""
    body = jsonp_json(task["text"].decode("utf-8"))
    if "pageBar" not in body:
        log_with_time("no pageBar: %s" % task["url"])
        return
    bar = body["pageBar"]
    tasks = []
    if not bar.get("totalCount", 0):
        log_with_time("empty category: %s" % task["url"])
        return tasks
    for page in range(1, bar["totalPage"] + 1):
        tasks.append({"url": payload(task['cat'], page)})
    return tasks
def stock_parser(task, rule):
    """Read the stock-count node and emit one format_price()-ed
    (url, price, 0/1) row; [] when the node is missing."""
    tree = etree.HTML(task["text"])
    found = tree.xpath(rule)
    if not found:
        log_with_time("bad response: %s" % task["url"])
        return []
    stock = 1 if int(found[0].text) else 0
    return format_price([(task['url'], task['price'], stock)])
def list_parser(task):
    """Scrape (link, fixed-price, 1) rows from the received listing page
    and return them format_price()-ed."""
    tree = etree.HTML(task["recv"].getvalue())
    rows = []
    for node in tree.xpath(task["rule"]["rule"]):
        link = node.xpath("div/div[@class = 'proTit']/a/@href")
        price = node.xpath("div/div[@class = 'proPrice']/text()")
        if not link or not price:
            log_with_time("rule error: %s" % task["old_url"])
            continue
        rows.append((link[0], fix_price(price[0]), 1))
    return format_price(rows)
def list_parser(task, rule):
    """Collect unique (gid-url, price) pairs from listing nodes; the
    leading character (currency symbol) is stripped from the price text.

    NOTE: deduplication via set() does not preserve node order.
    """
    tree = etree.HTML(task["text"])
    pairs = []
    for node in tree.xpath(rule["node"]):
        gidurl = node.xpath(rule['gidurl'])
        price = node.xpath(rule['price'])
        if not gidurl or not price:
            log_with_time("list parser err: %r" % task['url'])
            continue
        pairs.append((gidurl[0], price[0].text[1:]))
    return list(set(pairs))
def list_parser(task, rule):
    """Collect (absolute gid URL, numeric price) pairs from listing
    nodes; the price is extracted from the node text via re_price."""
    tree = etree.HTML(task['text'])
    pairs = []
    for node in tree.xpath(rule['nodes']):
        gid = node.xpath(rule['gid'])
        price = node.xpath(rule['price'])
        if not gid or not price:
            log_with_time("bad response: %r" % task['url'])
            continue
        full_url = burl + gid[0]
        amount = re_price.search(price[0].text).group()
        pairs.append((full_url, amount))
    return pairs
def list_parser(task, rule):
    """Parse (link, price, 1) rows from a list page and return them
    format_price()-ed.

    Fix: removed a leftover debugging `import pdb`.
    """
    t = etree.HTML(task["text"])
    ret = []
    for node in t.xpath(rule["node"]):
        link = node.xpath(rule["link"])
        price = node.xpath(rule["price"])
        if not link or not price:
            log_with_time("rule error: %s" % task["old_url"])
            continue
        p = fix_price(price[0])
        ret.append((str(link[0]), str(p), 1))
    return format_price(ret)
def promo_filter(item):
    """Build the promotion-check URL for one (url, sku) pair.

    Returns {"url": promo_url, "old": original_url}; None when the URL
    does not match the expected /<pid>-<sid>.h pattern.
    """
    url, sku = item
    parts = re.findall("/([A-Za-z-0-9]+)\.h", url)
    if not parts:
        log_with_time("url rule error: %s" % url)
        # fix: was falling through to parts[0] and raising IndexError
        return
    pid, sid = parts[0].split("-")
    p = promo_url.format(time=int(time.time() * 1000),
                         goodsNo=sku,
                         sid=sid,
                         pid=pid)
    return {
        "url": p,
        "old": url
    }