import time
from urllib.parse import urlparse  # on Python 2 codebases: from urlparse import urlparse

from scrapy import log  # legacy (pre-1.0) Scrapy logging API, matching log.msg below
from scrapy.utils.signal import send_catch_log

# get_uid, get_domain, and the signals module are project-local helpers;
# their imports depend on the surrounding package and are omitted here.


def __init__(self, cur_idepth, max_idepth,
             cur_xdepth, max_xdepth, content_group,
             pl_group, source, url):
    # Depth counters and grouping config carried through the crawl.
    self.cur_idepth = cur_idepth
    self.max_idepth = max_idepth
    self.cur_xdepth = cur_xdepth
    self.max_xdepth = max_xdepth
    self.content_group = content_group
    self.pl_group = pl_group
    self.source = source
    self.url = url
    # Identifiers derived from the URL.
    self.uid = get_uid(url)
    self.domain = get_domain(url)
    self.host = urlparse(self.url).hostname
def put(self, url, name, cat, price, collection_name=None):
    uid = get_uid(url)
    domain = get_domain(url)
    crawl_time = int(time.time())
    item = self.get(url, collection_name)
    if item:
        # Existing item: add_price appends the new sample and is expected
        # to return False when the price is a duplicate, so we can skip
        # the database write entirely.
        if item.add_price(price, crawl_time):
            self.dbclient.update_field(uid, collection_name,
                                       data=item.data,
                                       bottom_price=item.bottom_price)
        else:
            log.msg("duplicate price")
            return
    else:
        # New item: seed the price history and the bottom price with the
        # first observed sample.
        item = {
            "url": url,
            "uid": uid,
            "name": name,
            "cat": cat,
            "data": [(price, crawl_time)],
            "bottom_price": (price, crawl_time),
            "domain": domain,
        }
        self.dbclient.insert(item, uid, collection_name)
    # Notify listeners that an item was saved (not reached on duplicates).
    send_catch_log(signal=signals.item_saved, item=item)
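# Hypothetical sketch (not from this codebase) of the add_price contract
# that put() relies on: append a (price, crawl_time) sample, refresh
# bottom_price, and return False when the latest stored price is
# unchanged. The class and attribute names are assumptions for
# illustration only.
class PriceItem(object):
    def __init__(self, data, bottom_price):
        self.data = data                  # list of (price, crawl_time) samples
        self.bottom_price = bottom_price  # cheapest (price, crawl_time) seen

    def add_price(self, price, crawl_time):
        if self.data and self.data[-1][0] == price:
            return False  # duplicate of the most recent price: no update
        self.data.append((price, crawl_time))
        if price < self.bottom_price[0]:
            self.bottom_price = (price, crawl_time)
        return True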
def save_extract_info(self, url, item_num):
    if item_num > 0:
        # Success: bump the per-domain counter of extracted items.
        domain = get_domain(url)
        self.redis.hincrby(self.genkey(self.EXTRACT_ITEM_OK), domain, item_num)
    else:
        # Failure: remember the URL that yielded no items.
        self.redis.sadd(self.genkey(self.EXTRACT_ITEM_FAIL), url)
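# Illustrative use, assuming a `stats` instance of the class above; the
# genkey()-built key names simply follow from EXTRACT_ITEM_OK and
# EXTRACT_ITEM_FAIL. Successful extractions accumulate in a per-domain
# Redis hash, while failed URLs collect in a Redis set:
#
#   stats.save_extract_info("http://example.com/page", 5)
#   # redis: HINCRBY <genkey(EXTRACT_ITEM_OK)> example.com 5
#
#   stats.save_extract_info("http://example.com/empty", 0)
#   # redis: SADD <genkey(EXTRACT_ITEM_FAIL)> http://example.com/empty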