Exemple #1
0
 def __init__(self, cur_idepth, max_idepth,
              cur_xdepth, max_xdepth, content_group,
              pl_group, source, url):
     """Store the depth counters, group labels, source and URL on the instance.

     Besides copying the arguments verbatim, three values are derived from
     ``url``: ``uid`` via ``get_uid``, ``domain`` via ``get_domain`` and
     ``host`` as the hostname component returned by ``urlparse``.
     """
     # Current / maximum values for the two depth counters.
     self.cur_idepth, self.max_idepth = cur_idepth, max_idepth
     self.cur_xdepth, self.max_xdepth = cur_xdepth, max_xdepth

     # Group labels and the source of this entry.
     self.content_group, self.pl_group = content_group, pl_group
     self.source = source

     # The URL itself plus the identifiers derived from it.
     self.url = url
     self.uid, self.domain = get_uid(url), get_domain(url)
     parsed_url = urlparse(self.url)
     self.host = parsed_url.hostname
Exemple #2
0
    def put(self, url, name, cat, price, collection_name=None):
        """Record ``price`` for the item at ``url`` and announce the save.

        If ``self.get`` finds no existing item, a new record is built and
        inserted through ``self.dbclient``. Otherwise the price is added to
        the existing item and, when ``add_price`` reports a change, the
        stored ``data`` and ``bottom_price`` fields are updated.

        When ``add_price`` returns a falsy value the price is logged as a
        duplicate and the method returns without sending any signal. In
        every other case ``signals.item_saved`` is sent with the item.
        """
        uid = get_uid(url)
        domain = get_domain(url)
        crawl_time = int(time.time())
        item = self.get(url, collection_name)

        if not item:
            # First sighting of this URL: the single observation is both the
            # whole price history and the lowest price so far.
            item = {
                "url": url,
                "uid": uid,
                "name": name,
                "cat": cat,
                "data": [(price, crawl_time)],
                "bottom_price": (price, crawl_time),
                "domain": domain,
            }
            self.dbclient.insert(item, uid, collection_name)
        elif item.add_price(price, crawl_time):
            # Known item whose price history changed: persist the new state.
            self.dbclient.update_field(
                uid,
                collection_name,
                data=item.data,
                bottom_price=item.bottom_price,
            )
        else:
            # Nothing new to store, so no "item saved" signal either.
            log.msg("duplicate price")
            return

        send_catch_log(signal=signals.item_saved, item=item)
Exemple #3
0
 def save_extract_info(self, url, item_num):
     """Record the outcome of an extraction for ``url`` in redis.

     When ``item_num`` is greater than zero, the hash stored under the
     ``EXTRACT_ITEM_OK`` key has the field for the URL's domain incremented
     by ``item_num``. Otherwise ``url`` is added to the set stored under
     the ``EXTRACT_ITEM_FAIL`` key.
     """
     # Guard clause: anything that is not strictly positive counts as failure.
     if not item_num > 0:
         fail_key = self.genkey(self.EXTRACT_ITEM_FAIL)
         self.redis.sadd(fail_key, url)
         return

     domain = get_domain(url)
     ok_key = self.genkey(self.EXTRACT_ITEM_OK)
     self.redis.hincrby(ok_key, domain, item_num)