def aggregate_categories(date=None):
    """Aggregate per-category stats for *date* into the CategoryIndex.

    For every (cate1, cate2) pair in l1l2s, combines the shop/brand counts
    from the redis indexes with the category's monthly and daily info, fills
    missing numeric fields with 0, and writes the result back via a single
    pipelined CategoryIndex transaction.
    """
    if date is None:
        date = defaultdate
    ci = CategoryIndex(date)
    si = ShopIndex(date)
    # Open a redis pipeline; everything below is flushed by ci.execute().
    ci.multi()
    cates = list(l1l2s)
    for cate1, cate2 in cates:
        info = {}
        if cate2 != 'all':
            # search_index lives in Cassandra, keyed by the level-2 cid.
            r = getdb().execute('select search_index from ataobao2.cate where id=:id', dict(id=cate2), result=True)
            if r and r.results:
                info['search_index'] = r.results[0][0]
            else:
                info['search_index'] = 0
        for mod in ['mon', 'day']:
            # shop/brand counts are mod-independent but refreshed each pass.
            info.update({
                'shops': si.getshops(cate1, cate2),
                'brands': ci.getbrands(cate1, cate2),
            })
            info.update(ci.getinfo(cate1, cate2, mod))
            print cate1, cate2, mod, info
            # Guarantee the core numeric fields exist before writing back.
            for field in ['deals', 'items', 'sales', 'delta_sales']:
                if field not in info:
                    info[field] = 0
            ci.setinfo(cate1, cate2, mod, info)
            ci.setindex(cate1, cate2, 'sales', mod, info.get('sales', 0))
    ci.execute()
def aggregate_shops(start, end, date=None):
    """Aggregate every shop whose token(id) falls in [start, end).

    Reads shop rows from Cassandra, runs aggregate_shop() on each, and
    records all seen shop ids in si.allshopids.  Errors are logged and
    swallowed so one bad shop cannot abort the whole range.
    """
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        si = ShopIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ci.multi()
        shopids = set()
        with db.connection() as cur:
            cur.execute('''select id, title, logo, type, credit_score, num_products, good_rating, num_collects from ataobao2.shop where token(id)>=:start and token(id)<:end''',
                        dict(start=start, end=end), consistency_level='ONE')
            for row in cur:
                # renamed from `type` to avoid shadowing the builtin
                shopid, name, logo, shop_type, credit_score, num_products, good_rating, num_collects = row
                shopids.add(shopid)
                try:
                    aggregate_shop(si, ci, shopid, name, logo, shop_type,
                                   credit_score, num_products, good_rating,
                                   num_collects)
                except Exception:
                    traceback.print_exc()
        # BUG FIX: redis sadd requires at least one member, so
        # si.allshopids.add(*shopids) raised when the token range was empty.
        if shopids:
            si.allshopids.add(*shopids)
        si.execute()
        ci.execute()
    except Exception:
        traceback.print_exc()
def aggregate_brand(bi, ci, date, brand):
    """Update one brand's share/shop-count info and sales index for *date*.

    For every (cate1, cate2) the brand appears in, computes the brand's
    share of that category's monthly sales and the number of shops carrying
    it, writes both into the BrandIndex, and (for cate2 == 'all') persists a
    daily row to Cassandra.  An empty brand name maps to the '无品牌'
    ("no brand") bucket, which is excluded from the ranking index.
    """
    db = getdb()
    if brand == '':
        brand = '无品牌'
    # (removed unused local `baseinfo`)

    def update_with_cates(cate1, cate2):
        brandinfo = bi.getinfo(brand, cate1, cate2)
        sales = float(brandinfo.get('sales', 0))
        if brand != '无品牌':
            bi.setindex(brand, cate1, cate2, sales)
        categoryinfo = ci.getinfo(cate1, cate2, 'mon')
        try:
            share = sales / float(categoryinfo['sales'])
        except (KeyError, ZeroDivisionError, TypeError, ValueError):
            # category has no (or zero/garbled) sales -> no meaningful share
            share = 0
        num_shops = bi.getshops(brand, cate1, cate2) or 0
        bi.setinfo(brand, cate1, cate2, {'share': share, 'shops': num_shops})
        if cate2 == 'all':
            db.execute('''insert into ataobao2.brand_by_date (name, datestr, cate1, sales, share, num_shops) values (:name, :datestr, :cate1, :sales, :share, :num_shops)''',
                       dict(name=brand.decode('utf-8'), datestr=date,
                            cate1=cate1, sales=sales, share=share,
                            num_shops=num_shops))

    # update info & index
    cates = bi.getcates(brand)
    for cate1, cate2 in cates:
        update_with_cates(cate1, cate2)
def aggregate_shops(start, end, date=None):
    """Aggregate every shop whose token(id) falls in [start, end).

    Reads shop rows from Cassandra, runs aggregate_shop() on each, and
    records all seen shop ids in si.allshopids.  Errors are logged and
    swallowed so one bad shop cannot abort the whole range.
    """
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        si = ShopIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ci.multi()
        shopids = set()
        with db.connection() as cur:
            cur.execute('''select id, title, logo, type, credit_score, num_products, good_rating, num_collects from ataobao2.shop where token(id)>=:start and token(id)<:end''',
                        dict(start=start, end=end), consistency_level='ONE')
            for row in cur:
                # renamed from `type` to avoid shadowing the builtin
                shopid, name, logo, shop_type, credit_score, num_products, good_rating, num_collects = row
                shopids.add(shopid)
                try:
                    aggregate_shop(si, ci, shopid, name, logo, shop_type,
                                   credit_score, num_products, good_rating,
                                   num_collects)
                except Exception:
                    traceback.print_exc()
        # BUG FIX: redis sadd requires at least one member, so
        # si.allshopids.add(*shopids) raised when the token range was empty.
        if shopids:
            si.allshopids.add(*shopids)
        si.execute()
        ci.execute()
    except Exception:
        traceback.print_exc()
def _getconn(date):
    """Return a ShardRedis over the aggregation redis hosts chosen for *date*.

    Looks the host list up in ataobao2.agghosts; if no row exists yet,
    picks the AGGRE_URIS entry that differs from the most recently *ready*
    run (so consecutive runs use different shards) and records the choice.
    """
    conns = []
    db = getdb()
    r = db.execute('''select datestr, hosts, ready from ataobao2.agghosts where datestr=:date''',
                   dict(date=date), result=True)
    if not r.results:
        # No assignment yet: inspect the last ~2 weeks of runs to see which
        # URI set the most recent ready run used.
        now = datetime.utcnow().date()
        dt = datetime.strptime(date, '%Y-%m-%d').date()
        dates = [(now - timedelta(days=days)).strftime('%Y-%m-%d')
                 for days in range(14 + (now - dt).days)]
        dates = '(' + ','.join(["'" + d + "'" for d in dates]) + ')'
        r = db.execute('''select datestr, hosts, ready from ataobao2.agghosts where datestr in {dates}'''.format(dates=dates),
                       result=True)
        alluris = [[ds, hosts, ready]
                   for ds, hosts, ready in sorted(r.results) if ready]
        if not alluris:
            uris = AGGRE_URIS[0]
        else:
            used_uris = json.loads(alluris[-1][1])
            # prefer a URI set different from the one last used
            for au in AGGRE_URIS:
                if set(au) != set(used_uris):
                    uris = au
                    break
            else:
                uris = used_uris
        db.execute('''insert into ataobao2.agghosts (datestr, hosts) values (:date, :hosts)''',
                   dict(date=date, hosts=json.dumps(uris)))
    else:
        uris = json.loads(r.results[0][1])
    # FIX: raw string (r'...') so '\d' is not an invalid escape, and the
    # pattern is compiled once instead of once per URI.
    uri_re = re.compile(r'redis://(.*):(\d+)/(\d+)')
    for uri in uris:
        host, port, dbn = uri_re.search(uri).groups()
        conn = redis.Redis(host=host, port=int(port), db=int(dbn))
        conns.append(conn)
    return ShardRedis(conns=conns)
def es_shop(si, date, shopid):
    """Build the search document for one shop and hand it to index_shop().

    Combines the shop's base info from the ShopIndex with monthly sales and
    deals totals across its level-1 categories, plus up to 4 hot items
    (only for shops with sales >= 10000 and credit_score >= 5).
    """
    db = getdb()
    shopinfo = si.getbase(shopid)
    num_products = int(shopinfo.get('num_products', 0))
    # "or 1" keeps a missing/zero credit score from reading as 0
    credit_score = int(shopinfo.get('credit_score', 0)) or 1
    good_rating = shopinfo.get('good_rating', '')
    title = shopinfo.get('name', '')
    logo = shopinfo.get('logo', '')
    type = shopinfo.get('type', '')  # NOTE(review): shadows builtin `type`
    worth = float(shopinfo.get('worth', 0))
    cates = si.getcates(shopid)
    sales = 0
    deals = 0
    c1s = list(set([str(c[0]) for c in cates]))
    c2s = list(set([str(c[1]) for c in cates]))
    for c1 in c1s:
        # monthly totals for the 'all' sub-bucket of each level-1 category
        info = si.getinfo(c1, 'all', 'mon', shopid)
        if info:
            sales += float(info.get('sales', 0))
            deals += int(info.get('deals', 0))
    items = si.gethotitems(shopid) or []
    hot_items = []
    if items and sales >= 10000 and credit_score >= 5:
        items = [int(id) for id in items]
        if len(items) == 1:
            # CQL `in` needs >= 2 values, so single ids use `=`
            r = db.execute(
                'select id, image, num_sold30 from ataobao2.item where id=:id',
                dict(id=items[0]), result=True)
        else:
            r = db.execute(
                'select id, image, num_sold30 from ataobao2.item where id in :ids',
                dict(ids=tuple(items[:4])), result=True)
        for row in r.results:
            itemid, image, num_sold30 = row
            hot_items.append({
                'itemid': itemid,
                'deals': num_sold30,
                'image': image
            })
    info = {
        'title': title,
        'logo': logo,
        'cate1': c1s,
        'cate2': c2s,
        'worth': worth,
        'sales': sales,
        'good_rating': good_rating,
        'type': type,
        'credit_score': credit_score,
        'num_products': num_products,
        'average_price': sales / deals if deals != 0 else 0,
        'hot_items': json.dumps(hot_items),
    }
    index_shop(int(shopid), info)
def generate_tasks(self):
    """Split the Cassandra token ring into fixed-size slices and queue one
    aggregate_items task per slice.

    Each ring range [ostart, oend) is cut into slices of `step` tokens
    (step = 2**64 // self.step); slice bounds are wrapped back into the
    signed 64-bit token space.  Tasks are then interleaved round-robin
    across hosts into one flat list before being enqueued.
    """
    self.clear_redis()
    conn = getdb().get()
    tclient = conn.client
    ring = tclient.describe_ring('ataobao2')
    conn.close()
    tokens = len(ring)  # NOTE(review): unused, kept for reference
    tasks = defaultdict(list)
    v264 = 2**64
    v263_1 = 2**63 - 1
    step = v264 // self.step
    #slicepertoken = self.step/tokens
    for tokenrange in ring:
        ostart = int(tokenrange.start_token)
        oend = int(tokenrange.end_token)
        # number of whole `step`-sized slices in this range (handles the
        # range that wraps past the top of the ring)
        slicepertoken = (oend - ostart) // step if ostart < oend else (
            v264 + oend - ostart) // step
        #step = (oend - ostart) // slicepertoken if ostart < oend else (v264+oend - ostart) // slicepertoken
        hosts = tokenrange.endpoints
        for i in range(slicepertoken):
            start = ostart + step * i
            end = start + step
            # wrap back into signed 64-bit token space
            if start > v263_1:
                start -= v264
            if end > v263_1:
                end -= v264
            tasks[hosts[0]].append([
                'aggregator.itemagg.aggregate_items', (start, end),
                dict(date=self.date, hosts=hosts)
            ])
        # trailing partial slice up to the range's real end
        # NOTE(review): start/end are not wrapped here like above — confirm
        # aggregate_items handles the unwrapped values
        start = ostart + slicepertoken * step
        end = oend
        tasks[hosts[0]].append([
            'aggregator.itemagg.aggregate_items', (start, end),
            dict(date=self.date, hosts=hosts)
        ])
    universe_tasks = []
    # averaging tasks
    #for _ in range(40):
    #    lenhosts = sorted([[len(tasks[host]), host] for host in tasks])
    #    delta = (lenhosts[-1][0] - lenhosts[0][0]) // 2
    #    if delta > 0 and len(tasks) > 1:
    #        print 'inequality index', delta
    #        for task in tasks[lenhosts[-1][1]][-delta:]:
    #            task[2]['hosts'] = task[2]['hosts'][-1:] + task[2]['hosts'][:-1]
    #            tasks[task[2]['hosts'][0]].insert(0, task)
    #        tasks[lenhosts[-1][1]] = tasks[lenhosts[-1][1]][:-delta]
    # round-robin drain so hosts' tasks are interleaved
    while sum(map(len, tasks.itervalues())):
        for host in tasks:
            if tasks[host]:
                task = tasks[host].pop()
                universe_tasks.append(task)
    self.add_tasks(*universe_tasks)
    self.finish_generation()
def save_iteminfo(date, ii, itemid, retry=0):
    """Load one item's base row and 60-day time series from Cassandra,
    derive its metrics via parse_iteminfo, and store them in ItemIndex *ii*
    under each of the item's (cate1, cate2) and (cate1, 'all') skeys.

    If the base row is missing the item is requeued for crawling; if the
    time series is empty the call retries itself up to 10 times.
    """
    db = getdb()
    # 60-day window ending at *date*; +16h shifts day boundaries to UTC+8
    date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
    date1 = date2 - timedelta(days=60)
    r1 = db.execute(
        '''select title, image, shopid, brand, price, num_sold30, cid from ataobao2.item where id=:itemid''',
        dict(itemid=itemid), result=True)
    r2 = db.execute(
        '''select date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where id=:itemid and date>=:date1 and date<:date2''',
        dict(itemid=itemid, date1=date1, date2=date2), result=True)
    # map local (UTC+8) date string -> metric tuple
    items = {(r[0] + timedelta(hours=8)).strftime("%Y-%m-%d"): r[1:]
             for r in r2.results}
    if not r1.results:
        # base row missing: push the id back onto the crawl queue
        from queues import ai1
        ai1.put(itemid)
        return
    if retry < 10 and items == {}:
        # time series not written yet; retry via bounded recursion
        print '....retry', retry + 1
        return save_iteminfo(date, ii, itemid, retry + 1)
    if r1.results:
        name, image, shopid, brand, price, deals_mon, cid = r1.results[0]
        info = parse_iteminfo(date, itemid, items, price, cid)
        if info is None:
            print 'no result from parse_iteminfo', date, itemid, items, price, cid
            return
        cate1 = info['l1']
        for cate2 in set([info['l2'], 'all']):
            ii.setinfo(itemid, {
                'name': name,
                'image': image,
                'shopid': shopid,
                'brand': brand,
                'price': info['price'],
                'sales_day': info['sales_day'],
                'sales_mon': info['sales_mon'],
                'deals_day': info['deals_day'],
                'deals_mon': info['deals_mon'],
                'delta_sales_mon': info['delta_sales_mon'],
                'delta_sales_day': info['delta_sales_day'],
            }, skey='{}_{}'.format(cate1, cate2))
def es_shop(si, date, shopid):
    """Build the search document for one shop and hand it to index_shop().

    Combines the shop's base info from the ShopIndex with monthly sales and
    deals totals across its level-1 categories, plus up to 4 hot items
    (only for shops with sales >= 10000 and credit_score >= 5).
    """
    db = getdb()
    shopinfo = si.getbase(shopid)
    num_products = int(shopinfo.get('num_products', 0))
    # "or 1" keeps a missing/zero credit score from reading as 0
    credit_score = int(shopinfo.get('credit_score', 0)) or 1
    good_rating = shopinfo.get('good_rating', '')
    title = shopinfo.get('name', '')
    logo = shopinfo.get('logo', '')
    type = shopinfo.get('type', '')  # NOTE(review): shadows builtin `type`
    worth = float(shopinfo.get('worth', 0))
    cates = si.getcates(shopid)
    sales = 0
    deals = 0
    c1s = list(set([str(c[0]) for c in cates]))
    c2s = list(set([str(c[1]) for c in cates]))
    for c1 in c1s:
        # monthly totals for the 'all' sub-bucket of each level-1 category
        info = si.getinfo(c1, 'all', 'mon', shopid)
        if info:
            sales += float(info.get('sales', 0))
            deals += int(info.get('deals', 0))
    items = si.gethotitems(shopid) or []
    hot_items = []
    if items and sales >= 10000 and credit_score >= 5:
        items = [int(id) for id in items]
        if len(items) == 1:
            # CQL `in` needs >= 2 values, so single ids use `=`
            r = db.execute('select id, image, num_sold30 from ataobao2.item where id=:id', dict(id=items[0]), result=True)
        else:
            r = db.execute('select id, image, num_sold30 from ataobao2.item where id in :ids', dict(ids=tuple(items[:4])), result=True)
        for row in r.results:
            itemid, image, num_sold30 = row
            hot_items.append({'itemid': itemid, 'deals': num_sold30, 'image': image})
    info = {
        'title': title,
        'logo': logo,
        'cate1': c1s,
        'cate2': c2s,
        'worth': worth,
        'sales': sales,
        'good_rating': good_rating,
        'type': type,
        'credit_score': credit_score,
        'num_products': num_products,
        'average_price': sales / deals if deals != 0 else 0,
        'hot_items': json.dumps(hot_items),
    }
    index_shop(int(shopid), info)
def generate_tasks(self):
    """Split the Cassandra token ring into fixed-size slices and queue one
    aggregate_items task per slice.

    Each ring range [ostart, oend) is cut into slices of `step` tokens
    (step = 2**64 // self.step); slice bounds are wrapped back into the
    signed 64-bit token space.  Tasks are then interleaved round-robin
    across hosts into one flat list before being enqueued.
    """
    self.clear_redis()
    conn = getdb().get()
    tclient = conn.client
    ring = tclient.describe_ring('ataobao2')
    conn.close()
    tokens = len(ring)  # NOTE(review): unused, kept for reference
    tasks = defaultdict(list)
    v264 = 2**64
    v263_1 = 2**63-1
    step = v264 // self.step
    #slicepertoken = self.step/tokens
    for tokenrange in ring:
        ostart = int(tokenrange.start_token)
        oend = int(tokenrange.end_token)
        # number of whole `step`-sized slices in this range (handles the
        # range that wraps past the top of the ring)
        slicepertoken = (oend - ostart) // step if ostart < oend else (v264+oend - ostart) // step
        #step = (oend - ostart) // slicepertoken if ostart < oend else (v264+oend - ostart) // slicepertoken
        hosts = tokenrange.endpoints
        for i in range(slicepertoken):
            start = ostart + step * i
            end = start + step
            # wrap back into signed 64-bit token space
            if start > v263_1:
                start -= v264
            if end > v263_1:
                end -= v264
            tasks[hosts[0]].append(['aggregator.itemagg.aggregate_items', (start, end), dict(date=self.date, hosts=hosts)])
        # trailing partial slice up to the range's real end
        # NOTE(review): start/end are not wrapped here like above — confirm
        # aggregate_items handles the unwrapped values
        start = ostart + slicepertoken*step
        end = oend
        tasks[hosts[0]].append(['aggregator.itemagg.aggregate_items', (start, end), dict(date=self.date, hosts=hosts)])
    universe_tasks = []
    # averaging tasks
    #for _ in range(40):
    #    lenhosts = sorted([[len(tasks[host]), host] for host in tasks])
    #    delta = (lenhosts[-1][0] - lenhosts[0][0]) // 2
    #    if delta > 0 and len(tasks) > 1:
    #        print 'inequality index', delta
    #        for task in tasks[lenhosts[-1][1]][-delta:]:
    #            task[2]['hosts'] = task[2]['hosts'][-1:] + task[2]['hosts'][:-1]
    #            tasks[task[2]['hosts'][0]].insert(0, task)
    #        tasks[lenhosts[-1][1]] = tasks[lenhosts[-1][1]][:-delta]
    # round-robin drain so hosts' tasks are interleaved
    while sum(map(len, tasks.itervalues())):
        for host in tasks:
            if tasks[host]:
                task = tasks[host].pop()
                universe_tasks.append(task)
    self.add_tasks(*universe_tasks)
    self.finish_generation()
def es_brand(bi, date, brand):
    """Build the search document for one brand and hand it to index_brand().

    Sums shops/items/deals/sales/delta over the brand's level-1 categories.
    Brands with fewer than 100 items are skipped.  An empty brand name maps
    to the '无品牌' ("no brand") bucket.
    """
    db = getdb()
    if brand == '':
        brand = '无品牌'
    # (removed unused local `d0` — the previous-day date string was never used)
    cates = bi.getcates(brand)
    c1s = list(set([c[0] for c in cates]))
    c2s = list(set([c[1] for c in cates]))
    shops = items = deals = sales = delta = 0
    for c1 in c1s:
        brandinfo = bi.getinfo(brand, c1, 'all')
        shops += int(brandinfo.get('shops', 0))
        items += int(brandinfo.get('items', 0))
        deals += int(brandinfo.get('deals', 0))
        sales += float(brandinfo.get('sales', 0))
        delta += float(brandinfo.get('delta_sales', 0))
    # we don't care about brands that do not have more than 100 items
    if items < 100:
        return
    cate1 = [str(c) for c in c1s]
    cate2 = [str(c) for c in c2s]
    r = db.execute('select logo from ataobao2.brand where name=:name',
                   dict(name=brand), result=True)
    try:
        logo = r.results[0][0]
    except (IndexError, TypeError):
        # no brand row / empty result set -> no logo
        logo = ''
    info = {
        'title': brand,
        'cate1': cate1,
        'cate2': cate2,
        'logo': logo,
        'shops': shops,
        'items': items,
        'deals': deals,
        'sales': sales,
        'delta': delta,
    }
    index_brand(brand, info)
def save_iteminfo(date, ii, itemid, retry=0):
    """Load one item's base row and 60-day time series from Cassandra,
    derive its metrics via parse_iteminfo, and store them in ItemIndex *ii*
    under each of the item's (cate1, cate2) and (cate1, 'all') skeys.

    If the base row is missing the item is requeued for crawling; if the
    time series is empty the call retries itself up to 10 times.
    """
    db = getdb()
    # 60-day window ending at *date*; +16h shifts day boundaries to UTC+8
    date2 = datetime.strptime(date, "%Y-%m-%d")+timedelta(hours=16)
    date1 = date2 - timedelta(days=60)
    r1 = db.execute('''select title, image, shopid, brand, price, num_sold30, cid from ataobao2.item where id=:itemid''', dict(itemid=itemid), result=True)
    r2 = db.execute('''select date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where id=:itemid and date>=:date1 and date<:date2''', dict(itemid=itemid, date1=date1, date2=date2), result=True)
    # map local (UTC+8) date string -> metric tuple
    items = {(r[0]+timedelta(hours=8)).strftime("%Y-%m-%d"): r[1:] for r in r2.results}
    if not r1.results:
        # base row missing: push the id back onto the crawl queue
        from queues import ai1
        ai1.put(itemid)
        return
    if retry < 10 and items == {}:
        # time series not written yet; retry via bounded recursion
        print '....retry', retry+1
        return save_iteminfo(date, ii, itemid, retry+1)
    if r1.results:
        name, image, shopid, brand, price, deals_mon, cid = r1.results[0]
        info = parse_iteminfo(date, itemid, items, price, cid)
        if info is None:
            print 'no result from parse_iteminfo', date, itemid, items, price, cid
            return
        cate1 = info['l1']
        for cate2 in set([info['l2'], 'all']):
            ii.setinfo(itemid, {
                'name': name,
                'image': image,
                'shopid': shopid,
                'brand': brand,
                'price': info['price'],
                'sales_day': info['sales_day'],
                'sales_mon': info['sales_mon'],
                'deals_day': info['deals_day'],
                'deals_mon': info['deals_mon'],
                'delta_sales_mon': info['delta_sales_mon'],
                'delta_sales_day': info['delta_sales_day'],
            }, skey='{}_{}'.format(cate1, cate2))
def _getconn(date):
    """Return a ShardRedis over the aggregation redis hosts chosen for *date*.

    Looks the host list up in ataobao2.agghosts; if no row exists yet,
    picks the AGGRE_URIS entry that differs from the most recently *ready*
    run (so consecutive runs use different shards) and records the choice.
    """
    conns = []
    db = getdb()
    r = db.execute('''select datestr, hosts, ready from ataobao2.agghosts where datestr=:date''',
                   dict(date=date), result=True)
    if not r.results:
        # No assignment yet: inspect the last ~2 weeks of runs to see which
        # URI set the most recent ready run used.
        now = datetime.utcnow().date()
        dt = datetime.strptime(date, '%Y-%m-%d').date()
        dates = [(now - timedelta(days=days)).strftime('%Y-%m-%d')
                 for days in range(14 + (now - dt).days)]
        dates = '(' + ','.join(["'" + d + "'" for d in dates]) + ')'
        r = db.execute('''select datestr, hosts, ready from ataobao2.agghosts where datestr in {dates}'''.format(dates=dates),
                       result=True)
        alluris = [[ds, hosts, ready]
                   for ds, hosts, ready in sorted(r.results) if ready]
        if not alluris:
            uris = AGGRE_URIS[0]
        else:
            used_uris = json.loads(alluris[-1][1])
            # prefer a URI set different from the one last used
            for au in AGGRE_URIS:
                if set(au) != set(used_uris):
                    uris = au
                    break
            else:
                uris = used_uris
        db.execute('''insert into ataobao2.agghosts (datestr, hosts) values (:date, :hosts)''',
                   dict(date=date, hosts=json.dumps(uris)))
    else:
        uris = json.loads(r.results[0][1])
    # FIX: raw string (r'...') so '\d' is not an invalid escape, and the
    # pattern is compiled once instead of once per URI.
    uri_re = re.compile(r'redis://(.*):(\d+)/(\d+)')
    for uri in uris:
        host, port, dbn = uri_re.search(uri).groups()
        conn = redis.Redis(host=host, port=int(port), db=int(dbn))
        conns.append(conn)
    return ShardRedis(conns=conns)
def es_brand(bi, date, brand):
    """Build the search document for one brand and hand it to index_brand().

    Sums shops/items/deals/sales/delta over the brand's level-1 categories.
    Brands with fewer than 100 items are skipped.  An empty brand name maps
    to the "无品牌" ("no brand") bucket.
    """
    db = getdb()
    if brand == "":
        brand = "无品牌"
    # (removed unused local `d0` — the previous-day date string was never used)
    cates = bi.getcates(brand)
    c1s = list(set([c[0] for c in cates]))
    c2s = list(set([c[1] for c in cates]))
    shops = items = deals = sales = delta = 0
    for c1 in c1s:
        brandinfo = bi.getinfo(brand, c1, "all")
        shops += int(brandinfo.get("shops", 0))
        items += int(brandinfo.get("items", 0))
        deals += int(brandinfo.get("deals", 0))
        sales += float(brandinfo.get("sales", 0))
        delta += float(brandinfo.get("delta_sales", 0))
    # we don't care about brands that do not have more than 100 items
    if items < 100:
        return
    cate1 = [str(c) for c in c1s]
    cate2 = [str(c) for c in c2s]
    r = db.execute("select logo from ataobao2.brand where name=:name",
                   dict(name=brand), result=True)
    try:
        logo = r.results[0][0]
    except (IndexError, TypeError):
        # no brand row / empty result set -> no logo
        logo = ""
    info = {
        "title": brand,
        "cate1": cate1,
        "cate2": cate2,
        "logo": logo,
        "shops": shops,
        "items": items,
        "deals": deals,
        "sales": sales,
        "delta": delta,
    }
    index_brand(brand, info)
def clearall(date):
    """Reset all aggregation state for *date*.

    Clears every process's redis state, removes the date's agghosts row and
    aggregated data, prunes agghosts rows older than the latest ready run,
    and drops the temporary shop blacklist.
    """
    db = getdb('db1')
    for proc in all_processes:
        proc.clear_redis()
        proc.reset()
    db.execute('delete from ataobao2.agghosts where datestr=:date', dict(date=date))
    clear_date(date)
    rows = sorted(db.execute('select datestr, ready from ataobao2.agghosts', result=True).results)
    # latest datestr whose run is marked ready; fall back to *date* itself
    ready_dates = [datestr for datestr, ready in rows if ready]
    last = ready_dates[-1] if ready_dates else date
    # prune anything strictly older than the newest ready run
    for datestr, _ready in rows:
        if datestr < last:
            db.execute('delete from ataobao2.agghosts where datestr=:date', dict(date=datestr))
    db.execute('delete from ataobao2.blacklist where type=\'shopblacknew\';')
def clearall(date):
    """Wipe aggregation state for *date* and prune stale agghosts rows.

    Every registered process gets its redis state cleared and reset, the
    date's agghosts entry and aggregated data are deleted, agghosts rows
    older than the most recent ready run are removed, and the temporary
    shop blacklist is dropped.
    """
    db = getdb('db1')
    for worker in all_processes:
        worker.clear_redis()
        worker.reset()
    db.execute('delete from ataobao2.agghosts where datestr=:date', dict(date=date))
    clear_date(date)
    result = db.execute('select datestr, ready from ataobao2.agghosts', result=True)
    entries = sorted(result.results)
    # newest datestr with ready=True, defaulting to *date* when none exists
    marked = [entry[0] for entry in entries if entry[1]]
    cutoff = marked[-1] if marked else date
    for datestr, _ in entries:
        if not datestr < cutoff:
            continue
        db.execute('delete from ataobao2.agghosts where datestr=:date', dict(date=datestr))
    db.execute('delete from ataobao2.blacklist where type=\'shopblacknew\';')
def aggregate_brand(bi, ci, date, brand):
    """Update one brand's share/shop-count info and sales index for *date*.

    For every (cate1, cate2) the brand appears in, computes the brand's
    share of that category's monthly sales and the number of shops carrying
    it, writes both into the BrandIndex, and (for cate2 == 'all') persists a
    daily row to Cassandra.  An empty brand name maps to the '无品牌'
    ("no brand") bucket, which is excluded from the ranking index.
    """
    db = getdb()
    if brand == '':
        brand = '无品牌'
    # (removed unused local `baseinfo`)

    def update_with_cates(cate1, cate2):
        brandinfo = bi.getinfo(brand, cate1, cate2)
        sales = float(brandinfo.get('sales', 0))
        if brand != '无品牌':
            bi.setindex(brand, cate1, cate2, sales)
        categoryinfo = ci.getinfo(cate1, cate2, 'mon')
        try:
            share = sales / float(categoryinfo['sales'])
        except (KeyError, ZeroDivisionError, TypeError, ValueError):
            # category has no (or zero/garbled) sales -> no meaningful share
            share = 0
        num_shops = bi.getshops(brand, cate1, cate2) or 0
        bi.setinfo(brand, cate1, cate2, {'share': share, 'shops': num_shops})
        if cate2 == 'all':
            db.execute('''insert into ataobao2.brand_by_date (name, datestr, cate1, sales, share, num_shops) values (:name, :datestr, :cate1, :sales, :share, :num_shops)''',
                       dict(name=brand.decode('utf-8'), datestr=date,
                            cate1=cate1, sales=sales, share=share,
                            num_shops=num_shops))

    # update info & index
    cates = bi.getcates(brand)
    for cate1, cate2 in cates:
        update_with_cates(cate1, cate2)
def save_history_shop(si, date, shopid):
    """Persist one shop's daily history row (shop_by_date) for *date*.

    Gathers per-level-1-category rank/sales/deals (catetrend), the shop's
    top-10 brand share and category share for both 'mon' and 'day' windows,
    and writes everything to ataobao2.shop_by_date as JSON blobs.
    """
    db = getdb()
    shopinfo = si.getbase(shopid)
    if shopinfo:
        num_collects = int(shopinfo.get('num_collects', 0))
        worth = float(shopinfo.get('worth', 0))
        sales = 0
        catetrend = {}
        cates = si.getcates(shopid)
        c1s = list(set([c[0] for c in cates]))
        for c1 in c1s:
            catetrend[c1] = {'rank': si.getrank(c1, 'all', shopid, 'day')}
            info = si.getinfo(c1, 'all', 'day', shopid)
            if info:
                catetrend[c1]['sales'] = float(info.get('sales', 0))
                sales += catetrend[c1]['sales']
                catetrend[c1]['deals'] = int(info.get('deals', 0))
        # brand share: top 9 brands + an aggregated "其他" (others) bucket
        brandshare = {}
        for mod in ['mon', 'day']:
            brandshare[mod] = {}
            info = si.getbrandinfo(shopid, 'sales', mod)
            dealsinfo = si.getbrandinfo(shopid, 'deals', mod)
            binfo = [(brand, float(value), int(dealsinfo.get(brand, 0)))
                     for brand, value in info.iteritems() if float(value) > 0]
            total_sales = sum(sales for brand, sales, deals in binfo)
            if total_sales == 0:
                top10 = []
            else:
                tops = sorted(binfo, key=itemgetter(1), reverse=True)
                other_sales = sum(sales for brand, sales, deals in tops[9:])
                other_deals = sum(deals for brand, sales, deals in tops[9:])
                tops = [(brand.decode('utf-8'),
                         '{:4.2f}%'.format(sales * 100 / total_sales),
                         sales, deals) for brand, sales, deals in tops[:9]]
                if other_sales > 0:
                    tops.append(
                        (u'其他',
                         '{:4.2f}%'.format(other_sales * 100 / total_sales),
                         other_sales, other_deals))
                top10 = tops
            brandshare[mod] = top10
        # category share: same top-9 + "其他" structure over (cate1, cate2)
        cateshare = {}
        for mod in ['mon', 'day']:
            cinfo = []
            total_sales = 0
            for cate1, cate2 in cates:
                info = si.getinfo(cate1, cate2, mod, shopid)
                if info and float(info.get('sales', 0)) > 0:
                    total_sales += float(info.get('sales', 0))
                    cinfo.append(
                        (cate2, float(info.get('sales', 0)),
                         int(info.get('deals', 0))))
            if total_sales == 0:
                top10 = []
            else:
                tops = sorted(cinfo, key=itemgetter(1), reverse=True)
                other_sales = sum(sales for cate2, sales, deals in tops[9:])
                other_deals = sum(deals for cate2, sales, deals in tops[9:])
                tops = [(cate2, '{:4.2f}%'.format(sales * 100 / total_sales),
                         sales, deals) for cate2, sales, deals in tops[:9]]
                if other_sales > 0:
                    # BUG FIX: format was '{:4.2f}' (missing '%'), inconsistent
                    # with every other share string in this function.
                    tops.append(
                        (u'其他',
                         '{:4.2f}%'.format(other_sales * 100 / total_sales),
                         other_sales, other_deals))
                top10 = tops
            cateshare[mod] = top10
        # NOTE(review): `sales=total_sales` stores the last cateshare loop's
        # ('day' window) total, not the earlier `sales` accumulator — this
        # mirrors the original behavior; confirm it is intended.
        db.execute(
            '''insert into ataobao2.shop_by_date (id, datestr, worth, sales, num_collects, catetrend, brandshare, cateshare) values (:shopid, :datestr, :worth, :sales, :num_collects, :catetrend, :brandshare, :cateshare)''',
            dict(worth=worth,
                 num_collects=num_collects,
                 shopid=shopid,
                 datestr=date,
                 sales=total_sales,
                 catetrend=json.dumps(catetrend),
                 brandshare=json.dumps(brandshare),
                 cateshare=json.dumps(cateshare)))
def aggregate_items(start, end, hosts=[], date=None, retry=0):
    """Aggregate item rows with token(id) in [start, end) into the
    shop/item/brand/category redis indexes for *date*.

    On cluster read failure the host list is rotated and the call retries
    itself (max 20 attempts).  NOTE: the mutable default hosts=[] is only
    read/rebound here, never mutated in place, so it is safe.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')
    if start > end:
        # Wrapped token range: split at the signed-64-bit boundary.
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        # BUG FIX: previously fell through and also ran the body with the
        # inverted (start > end) range; the split above is complete.
        return
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        # 60-day window ending at *date*; +16h aligns days with UTC+8
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # BUG FIX: d1/d2 were only computed inside the "if hosts:" branch,
        # so the host-less fallback raised NameError on its second query.
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()
        try:
            if hosts:
                # read directly from a replica that owns this token range
                host = hosts[0]
                conn = db.get_connection(host)
                cur = conn.cursor()
                cur.execute('''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type from ataobao2.item where token(id)>=:start and token(id)<:end''',
                            dict(start=int(start), end=int(end)))
                iteminfos = list(cur)
                cur.execute('''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                            dict(start=int(start), end=int(end), date1=d1, date2=d2))
                itemts = list(cur)
                conn.close()
            else:
                iteminfos = db.execute('''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type from ataobao2.item where token(id)>=:start and token(id)<:end''',
                                       dict(start=int(start), end=int(end)),
                                       result=True).results
                itemts = db.execute('''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                                    dict(start=int(start), end=int(end), date1=d1, date2=d2),
                                    result=True).results
        except Exception:
            # FIX: message said "5 secs" while the code sleeps 30; also guard
            # hosts[0] so an empty host list cannot raise inside the handler.
            print('cluster error on host {}, range {}, retry {}, sleeping 30 secs...'.format(
                hosts[0] if hosts else None, (start, end), retry))
            hosts = hosts[-1:] + hosts[:-1]
            #traceback.print_exc()
            time.sleep(30)
            return aggregate_items(start, end, date=date, hosts=hosts,
                                   retry=retry + 1)

        # itemid -> {local 'YYYY-MM-DD': [num_collects, num_reviews,
        #                                 num_sold30, num_views, price]}
        itemtsdict = {}
        for row in itemts:
            itemid, rowdate, values = row[0], row[1], list(row[2:])
            # fix data malform
            # 1. num_collects (index 0) should not exceed 2**24 ~ 16 million
            if values[0] > 2**24:
                values[0] = 0
            if isinstance(rowdate, datetime):
                rowdate = (rowdate + timedelta(hours=8)).strftime("%Y-%m-%d")
            else:
                # raw 8-byte big-endian millisecond timestamp
                rowdate = datetime.utcfromtimestamp(
                    struct.unpack('!q', rowdate)[0] / 1000)
                rowdate = (rowdate + timedelta(hours=8)).strftime("%Y-%m-%d")
            itemtsdict.setdefault(itemid, {})[rowdate] = values

        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title,
                            item_type, itemid=itemid):
                #print itemid, 'skiped'
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemid in itemtsdict and itemtsdict[itemid]:
                try:
                    if shopid == 0:
                        # orphaned item: purge instead of aggregating
                        db.execute('delete from ataobao2.item where id=:id',
                                   dict(id=itemid))
                        db.execute('delete from ataobao2.item_by_date where id=:id',
                                   dict(id=itemid))
                        continue
                except Exception:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid],
                                   shopid, cid, price, brand, name, image,
                                   datestr)
                except Exception:
                    traceback.print_exc()
        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()
def aggregate_items(start, end, hosts=[], date=None, retry=0):
    """Aggregate item rows with token(id) in [start, end) into the
    shop/item/brand/category redis indexes for *date*.

    On cluster read failure the host list is rotated and the call retries
    itself (max 20 attempts).  NOTE: the mutable default hosts=[] is only
    read/rebound here, never mutated in place, so it is safe.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')
    if start > end:
        # Wrapped token range: split at the signed-64-bit boundary.
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        # BUG FIX: previously fell through and also ran the body with the
        # inverted (start > end) range; the split above is complete.
        return
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        # 60-day window ending at *date*; +16h aligns days with UTC+8
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # BUG FIX: d1/d2 were only computed inside the "if hosts:" branch,
        # so the host-less fallback raised NameError on its second query.
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()
        try:
            if hosts:
                # read directly from a replica that owns this token range
                host = hosts[0]
                conn = db.get_connection(host)
                cur = conn.cursor()
                cur.execute('''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type from ataobao2.item where token(id)>=:start and token(id)<:end''',
                            dict(start=int(start), end=int(end)))
                iteminfos = list(cur)
                cur.execute('''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                            dict(start=int(start), end=int(end), date1=d1, date2=d2))
                itemts = list(cur)
                conn.close()
            else:
                iteminfos = db.execute('''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type from ataobao2.item where token(id)>=:start and token(id)<:end''',
                                       dict(start=int(start), end=int(end)),
                                       result=True).results
                itemts = db.execute('''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                                    dict(start=int(start), end=int(end), date1=d1, date2=d2),
                                    result=True).results
        except Exception:
            # FIX: message said "5 secs" while the code sleeps 30; also guard
            # hosts[0] so an empty host list cannot raise inside the handler.
            print('cluster error on host {}, range {}, retry {}, sleeping 30 secs...'.format(
                hosts[0] if hosts else None, (start, end), retry))
            hosts = hosts[-1:] + hosts[:-1]
            #traceback.print_exc()
            time.sleep(30)
            return aggregate_items(start, end, date=date, hosts=hosts,
                                   retry=retry + 1)

        # itemid -> {local 'YYYY-MM-DD': [num_collects, num_reviews,
        #                                 num_sold30, num_views, price]}
        itemtsdict = {}
        for row in itemts:
            itemid, rowdate, values = row[0], row[1], list(row[2:])
            # fix data malform
            # 1. num_collects (index 0) should not exceed 2**24 ~ 16 million
            if values[0] > 2**24:
                values[0] = 0
            if isinstance(rowdate, datetime):
                rowdate = (rowdate + timedelta(hours=8)).strftime("%Y-%m-%d")
            else:
                # raw 8-byte big-endian millisecond timestamp
                rowdate = datetime.utcfromtimestamp(
                    struct.unpack('!q', rowdate)[0] / 1000)
                rowdate = (rowdate + timedelta(hours=8)).strftime("%Y-%m-%d")
            itemtsdict.setdefault(itemid, {})[rowdate] = values

        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title,
                            item_type, itemid=itemid):
                #print itemid, 'skiped'
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemid in itemtsdict and itemtsdict[itemid]:
                try:
                    if shopid == 0:
                        # orphaned item: purge instead of aggregating
                        db.execute('delete from ataobao2.item where id=:id',
                                   dict(id=itemid))
                        db.execute('delete from ataobao2.item_by_date where id=:id',
                                   dict(id=itemid))
                        continue
                except Exception:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid],
                                   shopid, cid, price, brand, name, image,
                                   datestr)
                except Exception:
                    traceback.print_exc()
        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()
#!/usr/bin/env python # -*- coding: utf-8 -*- """ Blacklist Item from our Aggregation Process """ from aggregator.models import getdb import threading lock = threading.Lock() db = getdb() bl_shopblackids = None bl_shopwhiteids = None bl_thresholds = None def load_bls(): global bl_shopblackids global bl_shopwhiteids global bl_thresholds with lock: if bl_shopblackids is None: print 'loading blacklist' bls = set([row for row in db.execute('select type, args, value from ataobao2.blacklist', result=True).results ]) bl_shopblackids = set(int(row[1]) for row in bls if row[0] == 'shopblack') bl_thresholds = {row[1]:float(row[2]) for row in bls if row[0] == 'cateprice'} bl_shopwhiteids = set(int(row[1]) for row in bls if row[0] == 'shopwhite') print 'blacklist loaded' def get_l1_and_l2(cid): from crawler.cates import cates if cid in cates: cidchain = []
#!/usr/bin/env python # -*- coding: utf-8 -*- from aggregator.models import getdb from aggregator.indexes import ShopIndex, ItemIndex, BrandIndex, CategoryIndex from aggregator.processes import Process from datetime import datetime, timedelta from collections import Counter from crawler.cates import l1l2s, topcids import json import traceback defaultdate = (datetime.utcnow()+timedelta(hours=-16)).strftime("%Y-%m-%d") db = getdb() def top10_brands(date=None): if date is None: date = defaultdate bi = BrandIndex(date) top10 = Counter() branddeals = Counter() for cate1 in topcids: for brand, sales in bi.getindex(cate1, 'all'): if brand == '无品牌': continue info = bi.getinfo(brand, cate1, 'all') branddeals[brand] += int(info.get('deals', 0)) top10[brand] += float(sales) top10 = top10.most_common(10)
def save_history_shop(si, date, shopid):
    """Persist one shop's daily history row (shop_by_date) for *date*.

    Gathers per-level-1-category rank/sales/deals (catetrend), the shop's
    top-10 brand share and category share for both 'mon' and 'day' windows,
    and writes everything to ataobao2.shop_by_date as JSON blobs.
    """
    db = getdb()
    shopinfo = si.getbase(shopid)
    if shopinfo:
        num_collects = int(shopinfo.get('num_collects', 0))
        worth = float(shopinfo.get('worth', 0))
        sales = 0
        catetrend = {}
        cates = si.getcates(shopid)
        c1s = list(set([c[0] for c in cates]))
        for c1 in c1s:
            catetrend[c1] = {'rank': si.getrank(c1, 'all', shopid, 'day')}
            info = si.getinfo(c1, 'all', 'day', shopid)
            if info:
                catetrend[c1]['sales'] = float(info.get('sales', 0))
                sales += catetrend[c1]['sales']
                catetrend[c1]['deals'] = int(info.get('deals', 0))
        # brand share: top 9 brands + an aggregated "其他" (others) bucket
        brandshare = {}
        for mod in ['mon', 'day']:
            brandshare[mod] = {}
            info = si.getbrandinfo(shopid, 'sales', mod)
            dealsinfo = si.getbrandinfo(shopid, 'deals', mod)
            binfo = [(brand, float(value), int(dealsinfo.get(brand, 0)))
                     for brand, value in info.iteritems() if float(value) > 0]
            total_sales = sum(sales for brand, sales, deals in binfo)
            if total_sales == 0:
                top10 = []
            else:
                tops = sorted(binfo, key=itemgetter(1), reverse=True)
                other_sales = sum(sales for brand, sales, deals in tops[9:])
                other_deals = sum(deals for brand, sales, deals in tops[9:])
                tops = [(brand.decode('utf-8'),
                         '{:4.2f}%'.format(sales*100/total_sales),
                         sales, deals) for brand, sales, deals in tops[:9]]
                if other_sales > 0:
                    tops.append((u'其他',
                                 '{:4.2f}%'.format(other_sales*100/total_sales),
                                 other_sales, other_deals))
                top10 = tops
            brandshare[mod] = top10
        # category share: same top-9 + "其他" structure over (cate1, cate2)
        cateshare = {}
        for mod in ['mon', 'day']:
            cinfo = []
            total_sales = 0
            for cate1, cate2 in cates:
                info = si.getinfo(cate1, cate2, mod, shopid)
                if info and float(info.get('sales', 0)) > 0:
                    total_sales += float(info.get('sales', 0))
                    cinfo.append((cate2, float(info.get('sales', 0)),
                                  int(info.get('deals', 0))))
            if total_sales == 0:
                top10 = []
            else:
                tops = sorted(cinfo, key=itemgetter(1), reverse=True)
                other_sales = sum(sales for cate2, sales, deals in tops[9:])
                other_deals = sum(deals for cate2, sales, deals in tops[9:])
                tops = [(cate2, '{:4.2f}%'.format(sales*100/total_sales),
                         sales, deals) for cate2, sales, deals in tops[:9]]
                if other_sales > 0:
                    # BUG FIX: format was '{:4.2f}' (missing '%'), inconsistent
                    # with every other share string in this function.
                    tops.append((u'其他',
                                 '{:4.2f}%'.format(other_sales*100/total_sales),
                                 other_sales, other_deals))
                top10 = tops
            cateshare[mod] = top10
        # NOTE(review): `sales=total_sales` stores the last cateshare loop's
        # ('day' window) total, not the earlier `sales` accumulator — this
        # mirrors the original behavior; confirm it is intended.
        db.execute('''insert into ataobao2.shop_by_date (id, datestr, worth, sales, num_collects, catetrend, brandshare, cateshare) values (:shopid, :datestr, :worth, :sales, :num_collects, :catetrend, :brandshare, :cateshare)''',
                   dict(worth=worth,
                        num_collects=num_collects,
                        shopid=shopid,
                        datestr=date,
                        sales=total_sales,
                        catetrend=json.dumps(catetrend),
                        brandshare=json.dumps(brandshare),
                        cateshare=json.dumps(cateshare)))
def mark_ready(date):
    """Flag the aggregation run for *date* as complete in ataobao2.agghosts."""
    agg_db = getdb('db1')
    query = 'insert into ataobao2.agghosts (datestr, ready) values (:date, true)'
    agg_db.execute(query, dict(date=date))
def mark_ready(date):
    """Record in ataobao2.agghosts that *date*'s aggregation is ready."""
    params = dict(date=date)
    getdb('db1').execute(
        'insert into ataobao2.agghosts (datestr, ready) values (:date, true)',
        params)