def process_detailpage(self):
    """Save the response body as an image file and record the price read from it.

    The product URL, name and category are taken from ``response.meta``.
    The response body is opened as an image, written to
    ``<tmpfile_dir>/<uid>.<format>`` (uid derived from the response URL),
    and the price is obtained by running ``gocr`` on that file.  The result
    is logged and handed to ``self.save``.

    Always returns 0.
    """
    response = self.response
    meta = response.meta

    # Pull the product details first so a missing key fails before any I/O.
    product_url = meta["prod_url"]
    product_name = meta["prod_name"]
    category = meta["cat"]

    page_uid = get_uid(response.url)
    picture = Image.open(StringIO(response.body))
    extension = picture.format.lower()
    image_file = "%s/%s.%s" % (self.tmpfile_dir, page_uid, extension)
    picture.save(image_file)

    # The price is read back from the file that was just written.
    price = gocr(image_file)
    log.msg("save image:%s, url:%s, price:%s" % (image_file, response.url, price))

    self.save(product_url, product_name, category, price)
    return 0
def __init__(self, cur_idepth, max_idepth,
             cur_xdepth, max_xdepth, content_group,
             pl_group, source, url):
    """Store the constructor arguments and derive identifiers from *url*.

    Every argument is kept on the instance under its own name.  In
    addition, ``uid`` and ``domain`` are computed from *url* with
    ``get_uid`` / ``get_domain`` and ``host`` is the hostname parsed out of
    the URL.

    NOTE(review): the ``*_idepth`` / ``*_xdepth`` pairs look like
    current/maximum depth counters -- confirm against the callers.
    """
    # Plain pass-through attributes.
    self.cur_idepth, self.max_idepth = cur_idepth, max_idepth
    self.cur_xdepth, self.max_xdepth = cur_xdepth, max_xdepth
    self.content_group, self.pl_group = content_group, pl_group
    self.source, self.url = source, url

    # Attributes derived from the URL.
    self.uid = get_uid(url)
    self.domain = get_domain(url)
    self.host = urlparse(self.url).hostname
def get(self, url, collection_name=None):
    """Return the stored record for *url* as a ``GoodsItem``, or ``None``.

    The record is looked up through ``self.dbclient.find_one`` using the
    uid derived from *url*.  When the lookup yields nothing (any falsy
    result) ``None`` is returned; otherwise a fresh ``GoodsItem`` is filled
    from the record, with ``oid`` set to the string form of ``_id``.
    """
    record = self.dbclient.find_one(get_uid(url), collection_name)
    if not record:
        return None

    goods = GoodsItem()
    goods.oid = str(record["_id"])
    # Remaining fields are copied verbatim under the same name.
    for field in ("url", "uid", "name", "cat", "data", "bottom_price", "domain"):
        setattr(goods, field, record[field])
    return goods
def process(self, item):
    """Send a promotion mail when the latest price dropped far enough.

    The two most recent entries of ``item.data`` are compared (index 0 of
    each entry is the price).  The ratio latest / previous is the
    "discount"; a mail is sent only when that ratio is less than or equal
    to ``self.accept_discount``.  Recipients come from ``self.rule.get``.

    Nothing is sent, and ``None`` is returned, when:
      * *item* is not a ``GoodsItem`` (a message is logged),
      * fewer than two price entries exist,
      * the previous price is 0, because the ratio is undefined (a message
        is logged instead of raising ``ZeroDivisionError``),
      * the ratio is above ``self.accept_discount``.
    """
    if not isinstance(item, GoodsItem):
        log.msg('expect a GoodsItem instance, got %s' % type(item))
        return
    if len(item.data) < 2:
        return

    # Compare the latest and second latest price.
    price = item.data[-1][0]
    previous_price = float(item.data[-2][0])
    if previous_price == 0:
        # A zero previous price would make the ratio undefined; skip the
        # notification rather than abort the handler with an exception.
        log.msg('previous price is 0, cannot compute discount: %s' % item.url)
        return

    discount = float(price) / previous_price
    if discount > self.accept_discount:
        return

    recipients = self.rule.get(get_uid(item.url), price, discount)
    # NOTE(review): "$title" is passed through literally here; presumably
    # the mail layer substitutes it -- confirm.
    subject = 'Big Promotion[$title]'
    content = "$title is now ¥%s, discount %s, %s" % (price, discount, item.url)
    self.mail.send(recipients, subject, content)
def put(self, url, name, cat, price, collection_name=None):
    """Record *price* for *url*, creating the stored record if needed.

    * No existing record: a new document holding a single
      ``(price, crawl_time)`` entry is inserted.
    * Existing record and ``add_price`` accepts the price: the ``data`` and
      ``bottom_price`` fields are updated in the store.
    * Existing record and ``add_price`` rejects the price: "duplicate
      price" is logged and the method returns without emitting a signal.

    In the first two cases ``signals.item_saved`` is sent with the item
    (a dict for a new record, the fetched item object otherwise).
    """
    record_uid = get_uid(url)
    record_domain = get_domain(url)
    crawl_time = int(time.time())

    item = self.get(url, collection_name)
    if not item:
        first_entry = (price, crawl_time)
        item = {
            "url": url,
            "uid": record_uid,
            "name": name,
            "cat": cat,
            "data": [first_entry],
            "bottom_price": first_entry,
            "domain": record_domain,
        }
        self.dbclient.insert(item, record_uid, collection_name)
    elif item.add_price(price, crawl_time):
        self.dbclient.update_field(record_uid, collection_name,
                                   data=item.data,
                                   bottom_price=item.bottom_price)
    else:
        # Nothing changed, so no save signal is emitted.
        log.msg("duplicate price")
        return

    send_catch_log(signal=signals.item_saved, item=item)