def generate_tasks(self):
    """Queue one save_iteminfos task per batch of top-selling item ids.

    Collects the monthly and daily top sellers for every (cate1, cate2)
    pair — plus a synthetic (cate1, 'all') roll-up per level-1 category —
    deduplicating ids across categories so each item is saved only once.
    """
    self.clear_redis()
    index = ItemIndex(self.date)
    categories = index.getcates()
    # Add the (cate1, 'all') roll-up pair once per distinct level-1 category.
    categories.extend(list({(c[0], 'all') for c in categories}))
    seen = set()
    for level1, level2 in categories:
        monthly = set(index.gettopitemids(level1, level2, 'sales', 'mon'))
        daily = set(index.gettopitemids(level1, level2, 'sales', 'day'))
        fresh = (monthly | daily) - seen
        seen |= fresh
        if fresh:
            self.add_task('aggregator.iteminfo.save_iteminfos', self.date, *fresh)
    self.finish_generation()
def top10_items(date=None):
    """Compute the 10 best-selling items (monthly sales) across the top
    categories and persist them as a JSON list into ataobao2.top10.

    :param date: 'YYYY-MM-DD' date string; defaults to `defaultdate`.
    """
    if date is None:
        date = defaultdate
    index = ItemIndex(date)
    sales_by_item = Counter()
    deals_by_item = Counter()
    for cate1 in topcids:
        for itemid, sales in index.getindex(cate1, 'all', 'sales', 'mon'):
            sales_by_item[itemid] = sales
            deals_by_item[itemid] = int(index.getinfo(itemid, cate1).get('deals_mon', 0))
    # Rows are (item id, monthly deal count, monthly sales), best sellers first.
    ranking = [(int(itemid), deals_by_item[itemid], sales)
               for itemid, sales in sales_by_item.most_common(10)]
    db.execute('''insert into ataobao2.top10 (datestr, field, value) values (:datestr, :field, :value)''',
               dict(datestr=date, field='item', value=json.dumps(ranking)))
def save_iteminfos(date, *itemids):
    """Best-effort save of item info for each id on *date*.

    A failure on one item must not abort the rest of the batch, so each
    save is wrapped individually and errors are printed, not raised.

    :param date: 'YYYY-MM-DD' date string used to build the ItemIndex.
    :param itemids: item ids to persist.
    """
    ii = ItemIndex(date)
    for itemid in itemids:
        try:
            save_iteminfo(date, ii, itemid)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate instead of being swallowed as item failures.
            traceback.print_exc()
def top10_items(date=None):
    """Store the 10 items with the highest monthly sales into ataobao2.top10.

    For each top-level category, reads the monthly sales index and the
    per-item monthly deal count, ranks by sales and writes the winners
    as a JSON-encoded list of (id, deals, sales) tuples.
    """
    if date is None:
        date = defaultdate
    index = ItemIndex(date)
    monthly_sales = Counter()
    monthly_deals = Counter()
    for top_cid in topcids:
        for item_id, sold in index.getindex(top_cid, 'all', 'sales', 'mon'):
            monthly_sales[item_id] = sold
            info = index.getinfo(item_id, top_cid)
            monthly_deals[item_id] = int(info.get('deals_mon', 0))
    winners = []
    for item_id, sold in monthly_sales.most_common(10):
        winners.append((int(item_id), monthly_deals[item_id], sold))
    payload = json.dumps(winners)
    db.execute(
        '''insert into ataobao2.top10 (datestr, field, value) values (:datestr, :field, :value)''',
        dict(datestr=date, field='item', value=payload))
def aggregate_items(start, end, hosts=[], date=None, retry=0):
    """Aggregate item rows in Cassandra token range [start, end) into the
    per-date shop/item/brand/category indexes.

    :param start, end: int64 token-range boundaries.  A wrapped range
        (start > end) is split into two linear sub-ranges.
    :param hosts: optional list of Cassandra hosts to query directly; when
        empty the shared db pool is used.  On a query error the list is
        rotated so the retry targets a different host.
    :param date: 'YYYY-MM-DD' string; defaults to `defaultdate`.
    :param retry: internal retry counter; gives up after 20 attempts.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')
    if start > end:
        # Wrapped token range: process the two linear halves and stop.
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        # bugfix: previously fell through and also queried the invalid
        # wrapped range below.
        return
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        # Shift to the UTC+8 business-day boundary, then look back 60 days.
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()
        try:
            if hosts:
                d2 = calendar.timegm(date2.utctimetuple()) * 1000
                d1 = calendar.timegm(date1.utctimetuple()) * 1000
                host = hosts[0]
                conn = db.get_connection(host)
                cur = conn.cursor()
                cur.execute('''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type from ataobao2.item where token(id)>=:start and token(id)<:end''',
                            dict(start=int(start), end=int(end)))
                iteminfos = list(cur)
                cur.execute('''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                            dict(start=int(start), end=int(end), date1=d1, date2=d2))
                itemts = list(cur)
                conn.close()
            else:
                iteminfos = db.execute('''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type from ataobao2.item where token(id)>=:start and token(id)<:end''',
                                       dict(start=int(start), end=int(end)), result=True).results
                itemts = db.execute('''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                                    dict(start=int(start), end=int(end), date1=d1, date2=d2), result=True).results
        except Exception:
            # bugfix: guard hosts[0] — the pooled (no-hosts) path used to raise
            # IndexError here, killing the retry; message now matches the
            # actual 30-second sleep.
            print('cluster error on host {}, range {}, retry {}, sleeping 30 secs...'.format(
                hosts[0] if hosts else None, (start, end), retry))
            # Rotate so the next attempt hits a different host first.
            hosts = hosts[-1:] + hosts[:-1]
            #traceback.print_exc()
            time.sleep(30)
            return aggregate_items(start, end, date=date, hosts=hosts, retry=retry + 1)
        # Build itemid -> {date string -> metric values} from the time series.
        itemtsdict = {}
        for row in itemts:
            itemid, date, values = row[0], row[1], list(row[2:])
            # fix data malform
            # 1. num_collects, index at 0, should not be larger than 2**24 ~ 16 million
            if values[0] > 2**24:
                values[0] = 0
            if isinstance(date, datetime):
                date = (date + timedelta(hours=8)).strftime("%Y-%m-%d")
            else:
                # Raw 8-byte big-endian millisecond timestamp.
                date = datetime.utcfromtimestamp(struct.unpack('!q', date)[0] / 1000)
                date = (date + timedelta(hours=8)).strftime("%Y-%m-%d")
            if itemid not in itemtsdict:
                itemtsdict[itemid] = {}
            itemtsdict[itemid][date] = values
        for itemid, shopid, cid, nc, price, brand, name, image, nr, credit_score, title, type in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title, type, itemid=itemid):
                #print itemid, 'skiped'
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemid in itemtsdict and itemtsdict[itemid]:
                try:
                    if shopid == 0:
                        # Orphan row with no shop: purge it instead of aggregating.
                        db.execute('delete from ataobao2.item where id=:id', dict(id=itemid))
                        db.execute('delete from ataobao2.item_by_date where id=:id', dict(id=itemid))
                        continue
                except Exception:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid], shopid,
                                   cid, price, brand, name, image, datestr)
                except Exception:
                    # One bad item must not abort the whole range.
                    traceback.print_exc()
        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()
def aggregate_items(start, end, hosts=[], date=None, retry=0):
    """Aggregate item rows in Cassandra token range [start, end) into the
    per-date shop/item/brand/category indexes.

    :param start, end: int64 token-range boundaries.  A wrapped range
        (start > end) is split into two linear sub-ranges.
    :param hosts: optional list of Cassandra hosts to query directly; when
        empty the shared db pool is used.  On a query error the list is
        rotated so the retry targets a different host.
    :param date: 'YYYY-MM-DD' string; defaults to `defaultdate`.
    :param retry: internal retry counter; gives up after 20 attempts.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')
    if start > end:
        # Wrapped token range: process the two linear halves and stop.
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        # bugfix: previously fell through and also queried the invalid
        # wrapped range below.
        return
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        # Shift to the UTC+8 business-day boundary, then look back 60 days.
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()
        try:
            if hosts:
                d2 = calendar.timegm(date2.utctimetuple()) * 1000
                d1 = calendar.timegm(date1.utctimetuple()) * 1000
                host = hosts[0]
                conn = db.get_connection(host)
                cur = conn.cursor()
                cur.execute(
                    '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type from ataobao2.item where token(id)>=:start and token(id)<:end''',
                    dict(start=int(start), end=int(end)))
                iteminfos = list(cur)
                cur.execute(
                    '''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                    dict(start=int(start), end=int(end), date1=d1, date2=d2))
                itemts = list(cur)
                conn.close()
            else:
                iteminfos = db.execute(
                    '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type from ataobao2.item where token(id)>=:start and token(id)<:end''',
                    dict(start=int(start), end=int(end)), result=True).results
                itemts = db.execute(
                    '''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                    dict(start=int(start), end=int(end), date1=d1, date2=d2),
                    result=True).results
        except Exception:
            # bugfix: guard hosts[0] — the pooled (no-hosts) path used to raise
            # IndexError here, killing the retry; message now matches the
            # actual 30-second sleep.
            print('cluster error on host {}, range {}, retry {}, sleeping 30 secs...'.format(
                hosts[0] if hosts else None, (start, end), retry))
            # Rotate so the next attempt hits a different host first.
            hosts = hosts[-1:] + hosts[:-1]
            #traceback.print_exc()
            time.sleep(30)
            return aggregate_items(start, end, date=date, hosts=hosts, retry=retry + 1)
        # Build itemid -> {date string -> metric values} from the time series.
        itemtsdict = {}
        for row in itemts:
            itemid, date, values = row[0], row[1], list(row[2:])
            # fix data malform
            # 1. num_collects, index at 0, should not be larger than 2**24 ~ 16 million
            if values[0] > 2**24:
                values[0] = 0
            if isinstance(date, datetime):
                date = (date + timedelta(hours=8)).strftime("%Y-%m-%d")
            else:
                # Raw 8-byte big-endian millisecond timestamp.
                date = datetime.utcfromtimestamp(
                    struct.unpack('!q', date)[0] / 1000)
                date = (date + timedelta(hours=8)).strftime("%Y-%m-%d")
            if itemid not in itemtsdict:
                itemtsdict[itemid] = {}
            itemtsdict[itemid][date] = values
        for itemid, shopid, cid, nc, price, brand, name, image, nr, credit_score, title, type in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title, type, itemid=itemid):
                #print itemid, 'skiped'
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemid in itemtsdict and itemtsdict[itemid]:
                try:
                    if shopid == 0:
                        # Orphan row with no shop: purge it instead of aggregating.
                        db.execute('delete from ataobao2.item where id=:id',
                                   dict(id=itemid))
                        db.execute(
                            'delete from ataobao2.item_by_date where id=:id',
                            dict(id=itemid))
                        continue
                except Exception:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid],
                                   shopid, cid, price, brand, name, image, datestr)
                except Exception:
                    # One bad item must not abort the whole range.
                    traceback.print_exc()
        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()