def aggregate_brands(date, *brands):
    """Run ``aggregate_brand`` for every given brand on ``date``.

    Builds a ``CategoryIndex`` and a ``BrandIndex`` for ``date``, calls
    ``multi()`` on both, aggregates each brand, and finally calls
    ``execute()`` on both indexes.

    Any error raised after the indexes are built is printed with
    ``traceback.print_exc()`` and swallowed; in that case the remaining
    brands are skipped and ``execute()`` is not called.

    Args:
        date: Date handed to both index constructors and to
            ``aggregate_brand``.
        *brands: Brands to aggregate.
    """
    ci = CategoryIndex(date)
    bi = BrandIndex(date)
    try:
        ci.multi()
        bi.multi()
        for brand in brands:
            aggregate_brand(bi, ci, date, brand)
        ci.execute()
        bi.execute()
    except Exception:
        # Best-effort task: report the failure but do not propagate it.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit through so the process can still be stopped.
        traceback.print_exc()
def top10_brands(date=None):
    """Store the ten best-selling brands for ``date`` in ``ataobao2.top10``.

    Sales and deal counts are summed per brand over every category in
    ``topcids``, ignoring the '无品牌' ("no brand") entry.  The ten brands with
    the highest summed sales are written as a JSON list of
    ``(brand, deals, sales)`` tuples under the field ``'brand'``.

    Args:
        date: Date used for the ``BrandIndex`` and as ``datestr`` of the
            inserted row; ``defaultdate`` is used when omitted.
    """
    if date is None:
        date = defaultdate
    index = BrandIndex(date)

    sales_by_brand = Counter()
    deals_by_brand = Counter()
    for cate1 in topcids:
        for brand, sales in index.getindex(cate1, 'all'):
            if brand != '无品牌':
                info = index.getinfo(brand, cate1, 'all')
                deals_by_brand[brand] += int(info.get('deals', 0))
                sales_by_brand[brand] += float(sales)

    # Deals are looked up with the raw brand key; only the stored name is
    # decoded to unicode.
    top10brand = [
        (brand.decode('utf-8'), deals_by_brand[brand], sales)
        for brand, sales in sales_by_brand.most_common(10)
    ]

    row = dict(datestr=date, field='brand', value=json.dumps(top10brand))
    db.execute(
        '''insert into ataobao2.top10 (datestr, field, value) values (:datestr, :field, :value)''',
        row)
def es_brands(brands, date=None):
    """Call ``es_brand`` for each brand, then ``flush()`` once.

    A failure on a single brand is printed and does not stop the remaining
    brands.  A failure anywhere else (building the ``BrandIndex``, the final
    ``flush()``) is printed and swallowed as well, so this function never
    raises an ``Exception`` to its caller.

    Args:
        brands: Iterable of brands to process.
        date: Date used for the ``BrandIndex`` and passed to ``es_brand``;
            ``defaultdate`` is used when omitted.
    """
    try:
        if date is None:
            date = defaultdate
        bi = BrandIndex(date)
        for brand in brands:
            try:
                es_brand(bi, date, brand)
            except Exception:
                # Keep going with the next brand; ``Exception`` rather than a
                # bare except so Ctrl-C / SystemExit are not swallowed.
                traceback.print_exc()
        flush()
    except Exception:
        traceback.print_exc()
def top10_brands(date=None):
    """Store the ten best-selling brands for ``date`` in ``ataobao2.top10``.

    Sales and deal counts are summed per brand over every category in
    ``topcids``, ignoring the '无品牌' ("no brand") entry.  The ten brands with
    the highest summed sales are written as a JSON list of
    ``(brand, deals, sales)`` tuples under the field ``'brand'``.

    Args:
        date: Date used for the ``BrandIndex`` and as ``datestr`` of the
            inserted row; ``defaultdate`` is used when omitted.
    """
    if date is None:
        date = defaultdate
    index = BrandIndex(date)

    sales_by_brand = Counter()
    deals_by_brand = Counter()
    for cate1 in topcids:
        for brand, sales in index.getindex(cate1, 'all'):
            if brand != '无品牌':
                info = index.getinfo(brand, cate1, 'all')
                deals_by_brand[brand] += int(info.get('deals', 0))
                sales_by_brand[brand] += float(sales)

    # Deals are looked up with the raw brand key; only the stored name is
    # decoded to unicode.
    top10brand = [
        (brand.decode('utf-8'), deals_by_brand[brand], sales)
        for brand, sales in sales_by_brand.most_common(10)
    ]

    row = dict(datestr=date, field='brand', value=json.dumps(top10brand))
    db.execute(
        '''insert into ataobao2.top10 (datestr, field, value) values (:datestr, :field, :value)''',
        row)
def generate_tasks(self):
    """Queue ``aggregate_brands`` tasks so that each brand is queued once.

    Walks the ``(cate1, cate2)`` pairs in ``l1l2s``, fetches the brand names
    of each pair from the ``CategoryIndex``, drops the brands already seen
    under an earlier pair, and queues the rest in batches of at most
    ``self.step`` brands.  ``self.clear_redis()`` is called first and
    ``self.finish_generation()`` last.
    """
    self.clear_redis()
    # NOTE(review): ``bi`` is not used below; the construction is kept only in
    # case ``BrandIndex(...)`` has side effects -- confirm and remove.
    bi = BrandIndex(self.date)
    ci = CategoryIndex(self.date)

    allbrands = set()
    for cate1, cate2 in l1l2s:
        brands1 = ci.getbrandnames(cate1, cate2)
        brands = list(brands1 - allbrands)
        allbrands.update(brands1)
        # Step through the list by batch size so that no empty batch is
        # queued when len(brands) is a multiple of the step (including zero
        # brands), and so the loop does not depend on integer ``/``.
        for offset in range(0, len(brands), self.step):
            bs = brands[offset:offset + self.step]
            self.add_task('aggregator.brandagg.aggregate_brands',
                          self.date, *bs)
    self.finish_generation()
def aggregate_items(start, end, hosts=[], date=None, retry=0):
    """Aggregate every item whose partition token lies in a token range.

    Reads the item rows and their per-day history rows from the ``ataobao2``
    keyspace, skips blacklisted items, and passes each remaining item that
    has sales and history to ``aggregate_item``.  The shop, item, brand and
    category indexes are put in ``multi()`` mode first and ``execute()``d
    once all items are processed.

    Errors other than the retry limit are printed with ``traceback`` and
    swallowed.

    Args:
        start: Lower token bound (converted with ``int``).
        end: Upper token bound (converted with ``int``).  When
            ``start > end`` the range is treated as wrapping around and is
            processed as the two sub-ranges ``start..2**63-1`` and
            ``-2**63..end``.
        hosts: Optional list of hosts.  When non-empty, the queries go
            through ``db.get_connection(hosts[0])``; after a failed query the
            list is rotated (last host moved to the front) for the retry.
            The list is never mutated in place.
        date: Date string in ``%Y-%m-%d`` format; ``defaultdate`` is used
            when omitted.
        retry: Number of failed attempts already made for this range.

    Raises:
        Exception: If ``retry`` has reached 20.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')

    if start > end:
        # A wrapping range is fully covered by its two halves; return so the
        # inverted range itself is not queried afterwards.
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        return

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date

        # History window: the 60 days ending 16 hours after the start of
        # ``date``.  Both bounds are converted to epoch milliseconds here
        # because both query paths below need them.
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000

        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        # NOTE(review): the item query uses [start, end) while the history
        # query uses (start, end]; rows sitting exactly on a bound are seen
        # by only one of the two queries.  Confirm which bound is intended.
        item_query = (
            'select id, shopid, cid, num_sold30, price, brand, title, image, '
            'num_reviews, credit_score, title, type from ataobao2.item '
            'where token(id)>=:start and token(id)<:end')
        history_query = (
            'select id, date, num_collects, num_reviews, num_sold30, '
            'num_views, price from ataobao2.item_by_date '
            'where token(id)>:start and token(id)<=:end '
            'and date>=:date1 and date<:date2 allow filtering')

        try:
            if hosts:
                conn = db.get_connection(hosts[0])
                try:
                    cur = conn.cursor()
                    cur.execute(item_query,
                                dict(start=int(start), end=int(end)))
                    iteminfos = list(cur)
                    cur.execute(history_query,
                                dict(start=int(start), end=int(end),
                                     date1=d1, date2=d2))
                    itemts = list(cur)
                finally:
                    # Release the connection even when a query fails.
                    conn.close()
            else:
                iteminfos = db.execute(
                    item_query,
                    dict(start=int(start), end=int(end)),
                    result=True).results
                itemts = db.execute(
                    history_query,
                    dict(start=int(start), end=int(end),
                         date1=d1, date2=d2),
                    result=True).results
        except Exception:
            # ``hosts`` may be empty on this path, so do not index it blindly.
            print('cluster error on host {}, range {}, retry {}, '
                  'sleeping 30 secs...'.format(
                      hosts[0] if hosts else None, (start, end), retry))
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(30)
            return aggregate_items(start, end, date=date, hosts=hosts,
                                   retry=retry + 1)

        # Group the history rows as {itemid: {"YYYY-MM-DD": [values...]}}.
        itemtsdict = {}
        for row in itemts:
            itemid, day, values = row[0], row[1], list(row[2:])
            # Repair malformed data: num_collects (index 0) should never be
            # larger than 2**24 (~16 million).
            if values[0] > 2**24:
                values[0] = 0
            if not isinstance(day, datetime):
                # Otherwise the value is unpacked as a big-endian 64-bit
                # integer holding epoch milliseconds.
                day = datetime.utcfromtimestamp(
                    struct.unpack('!q', day)[0] / 1000)
            # Shift by 8 hours before taking the calendar day
            # (presumably UTC -> UTC+8 local time; confirm).
            day = (day + timedelta(hours=8)).strftime("%Y-%m-%d")
            itemtsdict.setdefault(itemid, {})[day] = values

        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title,
                            item_type, itemid=itemid):
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemtsdict.get(itemid):
                try:
                    if shopid == 0:
                        # Items without a shop are purged, not aggregated.
                        db.execute(
                            'delete from ataobao2.item where id=:id',
                            dict(id=itemid))
                        db.execute(
                            'delete from ataobao2.item_by_date where id=:id',
                            dict(id=itemid))
                        continue
                except Exception:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid,
                                   itemtsdict[itemid], shopid, cid, price,
                                   brand, name, image, datestr)
                except Exception:
                    # One bad item must not abort the whole range.
                    traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()
def aggregate_items(start, end, hosts=[], date=None, retry=0):
    """Aggregate every item whose partition token lies in a token range.

    Reads the item rows and their per-day history rows from the ``ataobao2``
    keyspace, skips blacklisted items, and passes each remaining item that
    has sales and history to ``aggregate_item``.  The shop, item, brand and
    category indexes are put in ``multi()`` mode first and ``execute()``d
    once all items are processed.

    Errors other than the retry limit are printed with ``traceback`` and
    swallowed.

    Args:
        start: Lower token bound (converted with ``int``).
        end: Upper token bound (converted with ``int``).  When
            ``start > end`` the range is treated as wrapping around and is
            processed as the two sub-ranges ``start..2**63-1`` and
            ``-2**63..end``.
        hosts: Optional list of hosts.  When non-empty, the queries go
            through ``db.get_connection(hosts[0])``; after a failed query the
            list is rotated (last host moved to the front) for the retry.
            The list is never mutated in place.
        date: Date string in ``%Y-%m-%d`` format; ``defaultdate`` is used
            when omitted.
        retry: Number of failed attempts already made for this range.

    Raises:
        Exception: If ``retry`` has reached 20.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')

    if start > end:
        # A wrapping range is fully covered by its two halves; return so the
        # inverted range itself is not queried afterwards.
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        return

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date

        # History window: the 60 days ending 16 hours after the start of
        # ``date``.  Both bounds are converted to epoch milliseconds here
        # because both query paths below need them.
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000

        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        # NOTE(review): the item query uses [start, end) while the history
        # query uses (start, end]; rows sitting exactly on a bound are seen
        # by only one of the two queries.  Confirm which bound is intended.
        item_query = (
            'select id, shopid, cid, num_sold30, price, brand, title, image, '
            'num_reviews, credit_score, title, type from ataobao2.item '
            'where token(id)>=:start and token(id)<:end')
        history_query = (
            'select id, date, num_collects, num_reviews, num_sold30, '
            'num_views, price from ataobao2.item_by_date '
            'where token(id)>:start and token(id)<=:end '
            'and date>=:date1 and date<:date2 allow filtering')

        try:
            if hosts:
                conn = db.get_connection(hosts[0])
                try:
                    cur = conn.cursor()
                    cur.execute(item_query,
                                dict(start=int(start), end=int(end)))
                    iteminfos = list(cur)
                    cur.execute(history_query,
                                dict(start=int(start), end=int(end),
                                     date1=d1, date2=d2))
                    itemts = list(cur)
                finally:
                    # Release the connection even when a query fails.
                    conn.close()
            else:
                iteminfos = db.execute(
                    item_query,
                    dict(start=int(start), end=int(end)),
                    result=True).results
                itemts = db.execute(
                    history_query,
                    dict(start=int(start), end=int(end),
                         date1=d1, date2=d2),
                    result=True).results
        except Exception:
            # ``hosts`` may be empty on this path, so do not index it blindly.
            print('cluster error on host {}, range {}, retry {}, '
                  'sleeping 30 secs...'.format(
                      hosts[0] if hosts else None, (start, end), retry))
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(30)
            return aggregate_items(start, end, date=date, hosts=hosts,
                                   retry=retry + 1)

        # Group the history rows as {itemid: {"YYYY-MM-DD": [values...]}}.
        itemtsdict = {}
        for row in itemts:
            itemid, day, values = row[0], row[1], list(row[2:])
            # Repair malformed data: num_collects (index 0) should never be
            # larger than 2**24 (~16 million).
            if values[0] > 2**24:
                values[0] = 0
            if not isinstance(day, datetime):
                # Otherwise the value is unpacked as a big-endian 64-bit
                # integer holding epoch milliseconds.
                day = datetime.utcfromtimestamp(
                    struct.unpack('!q', day)[0] / 1000)
            # Shift by 8 hours before taking the calendar day
            # (presumably UTC -> UTC+8 local time; confirm).
            day = (day + timedelta(hours=8)).strftime("%Y-%m-%d")
            itemtsdict.setdefault(itemid, {})[day] = values

        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title,
                            item_type, itemid=itemid):
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemtsdict.get(itemid):
                try:
                    if shopid == 0:
                        # Items without a shop are purged, not aggregated.
                        db.execute(
                            'delete from ataobao2.item where id=:id',
                            dict(id=itemid))
                        db.execute(
                            'delete from ataobao2.item_by_date where id=:id',
                            dict(id=itemid))
                        continue
                except Exception:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid,
                                   itemtsdict[itemid], shopid, cid, price,
                                   brand, name, image, datestr)
                except Exception:
                    # One bad item must not abort the whole range.
                    traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()