예제 #1
0
def aggregate_brands(date, *brands):
    """Aggregate every brand in ``brands`` for ``date``.

    A ``CategoryIndex`` and a ``BrandIndex`` are created for ``date``; both
    have ``multi()`` called before the brands are processed and ``execute()``
    called afterwards (presumably a queue-then-commit batch -- confirm against
    the index classes).

    Args:
        date: Date key handed to both indexes and to ``aggregate_brand``.
        *brands: Brand identifiers to aggregate, one ``aggregate_brand`` call
            each.

    Errors raised while aggregating are printed with ``traceback`` and not
    re-raised, so one failing batch does not kill the worker. Only
    ``Exception`` subclasses are swallowed: ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so the process can still be stopped.
    """
    ci = CategoryIndex(date)
    bi = BrandIndex(date)
    try:
        ci.multi()
        bi.multi()
        for brand in brands:
            aggregate_brand(bi, ci, date, brand)

        # execute() is only reached when every brand was processed without
        # raising; a failure above skips both calls.
        ci.execute()
        bi.execute()
    except Exception:
        traceback.print_exc()
예제 #2
0
def top10_brands(date=None):
    """Compute the ten best-selling brands for ``date`` and store them.

    Sales and deal counts are summed per brand over every category in
    ``topcids`` (using the ``'all'`` index of ``BrandIndex``). The ten brands
    with the highest summed sales are written to ``ataobao2.top10`` under
    ``field='brand'`` as a JSON list of ``(brand, deals, sales)`` entries.

    Args:
        date: Date key to report on; falls back to ``defaultdate`` when None.
    """
    if date is None:
        date = defaultdate
    brand_index = BrandIndex(date)
    sales_by_brand = Counter()
    deals_by_brand = Counter()

    for cate1 in topcids:
        for brand, sales in brand_index.getindex(cate1, 'all'):
            # '无品牌' is the "no brand" placeholder and is left out.
            if brand != '无品牌':
                info = brand_index.getinfo(brand, cate1, 'all')
                deals_by_brand[brand] += int(info.get('deals', 0))
                sales_by_brand[brand] += float(sales)

    # Brand keys are decoded from UTF-8 only for the stored payload; the
    # counters are still looked up with the raw key.
    ranking = [
        (brand.decode('utf-8'), deals_by_brand[brand], sales)
        for brand, sales in sales_by_brand.most_common(10)
    ]

    insert_cql = '''insert into ataobao2.top10 (datestr, field, value) values
                    (:datestr, :field, :value)'''
    db.execute(
        insert_cql,
        dict(datestr=date, field='brand', value=json.dumps(ranking)))
예제 #3
0
파일: brandes.py 프로젝트: yankaics/data007
def es_brands(brands, date=None):
    """Run ``es_brand`` for each brand in ``brands``, then call ``flush()``.

    Args:
        brands: Iterable of brand identifiers.
        date: Date key for the ``BrandIndex``; falls back to ``defaultdate``
            when None.

    A failure on one brand is printed and the loop moves on to the next one;
    a failure anywhere else (index creation, ``flush()``) is printed as well.
    Nothing is re-raised. Only ``Exception`` subclasses are swallowed, so
    ``KeyboardInterrupt`` / ``SystemExit`` still stop the loop instead of
    being treated as a per-brand error.
    """
    try:
        if date is None:
            date = defaultdate
        bi = BrandIndex(date)
        for brand in brands:
            try:
                es_brand(bi, date, brand)
            except Exception:
                # Best effort: keep going with the remaining brands.
                traceback.print_exc()
        flush()
    except Exception:
        traceback.print_exc()
예제 #4
0
def top10_brands(date=None):
    """Compute the ten best-selling brands for ``date`` and store them.

    Sales and deal counts are summed per brand over every category in
    ``topcids`` (using the ``'all'`` index of ``BrandIndex``). The ten brands
    with the highest summed sales are written to ``ataobao2.top10`` under
    ``field='brand'`` as a JSON list of ``(brand, deals, sales)`` entries.

    Args:
        date: Date key to report on; falls back to ``defaultdate`` when None.
    """
    if date is None:
        date = defaultdate
    brand_index = BrandIndex(date)
    sales_by_brand = Counter()
    deals_by_brand = Counter()

    for cate1 in topcids:
        for brand, sales in brand_index.getindex(cate1, 'all'):
            # '无品牌' is the "no brand" placeholder and is left out.
            if brand != '无品牌':
                info = brand_index.getinfo(brand, cate1, 'all')
                deals_by_brand[brand] += int(info.get('deals', 0))
                sales_by_brand[brand] += float(sales)

    # Brand keys are decoded from UTF-8 only for the stored payload; the
    # counters are still looked up with the raw key.
    ranking = [
        (brand.decode('utf-8'), deals_by_brand[brand], sales)
        for brand, sales in sales_by_brand.most_common(10)
    ]

    insert_cql = '''insert into ataobao2.top10 (datestr, field, value) values
                    (:datestr, :field, :value)'''
    db.execute(
        insert_cql,
        dict(datestr=date, field='brand', value=json.dumps(ranking)))
예제 #5
0
 def generate_tasks(self):
     """Queue ``aggregate_brands`` tasks covering every category brand once.

     For each ``(cate1, cate2)`` pair in ``l1l2s`` the brand names reported by
     ``CategoryIndex.getbrandnames`` are collected; names already seen under
     an earlier pair are skipped. The remaining names are queued in batches
     of at most ``self.step`` brands per task.

     Batches are produced with a stepped ``range`` so that no empty task is
     queued when a category contributes no new brands (or an exact multiple
     of ``self.step``), and so the batching does not depend on ``/`` being
     integer division.
     """
     self.clear_redis()
     # NOTE(review): ``bi`` is not used below; the construction is kept only
     # because it cannot be confirmed from here that BrandIndex() is free of
     # side effects.
     bi = BrandIndex(self.date)
     ci = CategoryIndex(self.date)
     allbrands = set()
     for cate1, cate2 in l1l2s:
         brands1 = ci.getbrandnames(cate1, cate2)
         # Set difference: only brands not queued for an earlier category.
         brands = list(brands1 - allbrands)
         allbrands.update(brands1)
         for offset in range(0, len(brands), self.step):
             bs = brands[offset:offset + self.step]
             self.add_task('aggregator.brandagg.aggregate_brands',
                           self.date, *bs)
     self.finish_generation()
예제 #6
0
def aggregate_brands(date, *brands):
    """Aggregate every brand in ``brands`` for ``date``.

    A ``CategoryIndex`` and a ``BrandIndex`` are created for ``date``; both
    have ``multi()`` called before the brands are processed and ``execute()``
    called afterwards (presumably a queue-then-commit batch -- confirm against
    the index classes).

    Args:
        date: Date key handed to both indexes and to ``aggregate_brand``.
        *brands: Brand identifiers to aggregate, one ``aggregate_brand`` call
            each.

    Errors raised while aggregating are printed with ``traceback`` and not
    re-raised, so one failing batch does not kill the worker. Only
    ``Exception`` subclasses are swallowed: ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so the process can still be stopped.
    """
    ci = CategoryIndex(date)
    bi = BrandIndex(date)
    try:
        ci.multi()
        bi.multi()
        for brand in brands:
            aggregate_brand(bi, ci, date, brand)

        # execute() is only reached when every brand was processed without
        # raising; a failure above skips both calls.
        ci.execute()
        bi.execute()
    except Exception:
        traceback.print_exc()
예제 #7
0
def _fetch_item_rows(db, hosts, start, end, d1, d2):
    """Fetch item rows and per-date rows for one token range.

    With a non-empty ``hosts`` the queries run on a dedicated connection to
    ``hosts[0]``, which is always closed afterwards; otherwise they go
    through ``db.execute``. Returns ``(iteminfos, itemts)``.
    """
    # NOTE(review): the two queries use different bounds (``>=``/``<`` versus
    # ``>``/``<=``), so an id sitting exactly on a boundary token shows up in
    # only one of the two result sets -- confirm this is intended.
    item_cql = '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end'''
    ts_cql = '''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date 
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering'''
    item_params = dict(start=int(start), end=int(end))
    ts_params = dict(start=int(start), end=int(end), date1=d1, date2=d2)

    if not hosts:
        iteminfos = db.execute(item_cql, item_params, result=True).results
        itemts = db.execute(ts_cql, ts_params, result=True).results
        return iteminfos, itemts

    conn = db.get_connection(hosts[0])
    try:
        cur = conn.cursor()
        cur.execute(item_cql, item_params)
        iteminfos = list(cur)
        cur.execute(ts_cql, ts_params)
        itemts = list(cur)
    finally:
        # Close even when a query fails so retries do not leak connections.
        conn.close()
    return iteminfos, itemts


def _group_timeseries_by_item(itemts):
    """Return ``{itemid: {'YYYY-MM-DD': values}}`` built from per-date rows."""
    grouped = {}
    for row in itemts:
        itemid, rowdate, values = row[0], row[1], list(row[2:])
        # Repair malformed data: num_collects (index 0) should never exceed
        # 2**24 (~16 million); larger values are reset to 0.
        if values[0] > 2**24:
            values[0] = 0
        if not isinstance(rowdate, datetime):
            # Raw value: 8 bytes holding a big-endian signed millisecond
            # timestamp.
            rowdate = datetime.utcfromtimestamp(
                struct.unpack('!q', rowdate)[0] / 1000)
        # Shift by 8 hours before taking the calendar day (presumably a
        # UTC -> UTC+8 local-date conversion -- confirm).
        day = (rowdate + timedelta(hours=8)).strftime("%Y-%m-%d")
        grouped.setdefault(itemid, {})[day] = values
    return grouped


def aggregate_items(start, end, hosts=[], date=None, retry=0):
    """Aggregate all items whose id token lies in the range ``start``..``end``.

    Item rows and their per-date rows from the 60 days before
    ``date + 16 hours`` are loaded, grouped per item, and every item that
    passes the blacklist, has ``num_sold30 > 0`` and has per-date data is
    handed to ``aggregate_item`` together with the shop, item, brand and
    category indexes for ``date``.

    Args:
        start: Lower token bound. When ``start > end`` the range is treated
            as wrapping and is processed as ``start..2**63-1`` plus
            ``-2**63..end``.
        end: Upper token bound.
        hosts: Optional host list; queries go to ``hosts[0]`` and the list is
            rotated on every retry. The default list is never mutated.
        date: ``'YYYY-MM-DD'`` string; falls back to ``defaultdate`` when None.
        retry: Number of retries already performed.

    Raises:
        Exception: if ``retry`` is 20 or more on entry.

    A failing query is retried after a 30 second pause. Any other error is
    printed with ``traceback`` and swallowed.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')

    if start > end:
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        # Both halves are done; do not query the wrapped range again.
        return

    retry_delay = 30  # seconds between query retries

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # Millisecond bounds are needed by both query paths, so they are
        # computed up front rather than only when ``hosts`` is given.
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        try:
            iteminfos, itemts = _fetch_item_rows(db, hosts, start, end, d1, d2)
        except Exception:
            failed_host = hosts[0] if hosts else None
            print('cluster error on host {}, range {}, retry {}, sleeping {} secs...'.format(
                failed_host, (start, end), retry, retry_delay))
            # Rotate so the next attempt targets a different host.
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(retry_delay)
            return aggregate_items(start, end, date=date, hosts=hosts, retry=retry + 1)

        itemtsdict = _group_timeseries_by_item(itemts)

        # ``name`` and ``title`` both come from the title column, which the
        # item query selects twice.
        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title,
                            item_type, itemid=itemid):
                continue
            brand = clean_brand(brand)
            if not (nc > 0 and itemtsdict.get(itemid)):
                continue
            if shopid == 0:
                # Items without a shop are purged and never aggregated, even
                # if the purge itself fails.
                try:
                    db.execute('delete from ataobao2.item where id=:id', dict(id=itemid))
                    db.execute('delete from ataobao2.item_by_date where id=:id', dict(id=itemid))
                except Exception:
                    traceback.print_exc()
                continue
            try:
                aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid],
                               shopid, cid, price, brand, name, image, date)
            except Exception:
                # Best effort: one bad item must not abort the whole range.
                traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()
예제 #8
0
파일: itemagg.py 프로젝트: yankaics/data007
def _fetch_item_rows(db, hosts, start, end, d1, d2):
    """Fetch item rows and per-date rows for one token range.

    With a non-empty ``hosts`` the queries run on a dedicated connection to
    ``hosts[0]``, which is always closed afterwards; otherwise they go
    through ``db.execute``. Returns ``(iteminfos, itemts)``.
    """
    # NOTE(review): the two queries use different bounds (``>=``/``<`` versus
    # ``>``/``<=``), so an id sitting exactly on a boundary token shows up in
    # only one of the two result sets -- confirm this is intended.
    item_cql = '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end'''
    ts_cql = '''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date 
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering'''
    item_params = dict(start=int(start), end=int(end))
    ts_params = dict(start=int(start), end=int(end), date1=d1, date2=d2)

    if not hosts:
        iteminfos = db.execute(item_cql, item_params, result=True).results
        itemts = db.execute(ts_cql, ts_params, result=True).results
        return iteminfos, itemts

    conn = db.get_connection(hosts[0])
    try:
        cur = conn.cursor()
        cur.execute(item_cql, item_params)
        iteminfos = list(cur)
        cur.execute(ts_cql, ts_params)
        itemts = list(cur)
    finally:
        # Close even when a query fails so retries do not leak connections.
        conn.close()
    return iteminfos, itemts


def _group_timeseries_by_item(itemts):
    """Return ``{itemid: {'YYYY-MM-DD': values}}`` built from per-date rows."""
    grouped = {}
    for row in itemts:
        itemid, rowdate, values = row[0], row[1], list(row[2:])
        # Repair malformed data: num_collects (index 0) should never exceed
        # 2**24 (~16 million); larger values are reset to 0.
        if values[0] > 2**24:
            values[0] = 0
        if not isinstance(rowdate, datetime):
            # Raw value: 8 bytes holding a big-endian signed millisecond
            # timestamp.
            rowdate = datetime.utcfromtimestamp(
                struct.unpack('!q', rowdate)[0] / 1000)
        # Shift by 8 hours before taking the calendar day (presumably a
        # UTC -> UTC+8 local-date conversion -- confirm).
        day = (rowdate + timedelta(hours=8)).strftime("%Y-%m-%d")
        grouped.setdefault(itemid, {})[day] = values
    return grouped


def aggregate_items(start, end, hosts=[], date=None, retry=0):
    """Aggregate all items whose id token lies in the range ``start``..``end``.

    Item rows and their per-date rows from the 60 days before
    ``date + 16 hours`` are loaded, grouped per item, and every item that
    passes the blacklist, has ``num_sold30 > 0`` and has per-date data is
    handed to ``aggregate_item`` together with the shop, item, brand and
    category indexes for ``date``.

    Args:
        start: Lower token bound. When ``start > end`` the range is treated
            as wrapping and is processed as ``start..2**63-1`` plus
            ``-2**63..end``.
        end: Upper token bound.
        hosts: Optional host list; queries go to ``hosts[0]`` and the list is
            rotated on every retry. The default list is never mutated.
        date: ``'YYYY-MM-DD'`` string; falls back to ``defaultdate`` when None.
        retry: Number of retries already performed.

    Raises:
        Exception: if ``retry`` is 20 or more on entry.

    A failing query is retried after a 30 second pause. Any other error is
    printed with ``traceback`` and swallowed.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')

    if start > end:
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        # Both halves are done; do not query the wrapped range again.
        return

    retry_delay = 30  # seconds between query retries

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # Millisecond bounds are needed by both query paths, so they are
        # computed up front rather than only when ``hosts`` is given.
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        try:
            iteminfos, itemts = _fetch_item_rows(db, hosts, start, end, d1, d2)
        except Exception:
            failed_host = hosts[0] if hosts else None
            print(
                'cluster error on host {}, range {}, retry {}, sleeping {} secs...'
                .format(failed_host, (start, end), retry, retry_delay))
            # Rotate so the next attempt targets a different host.
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(retry_delay)
            return aggregate_items(start,
                                   end,
                                   date=date,
                                   hosts=hosts,
                                   retry=retry + 1)

        itemtsdict = _group_timeseries_by_item(itemts)

        # ``name`` and ``title`` both come from the title column, which the
        # item query selects twice.
        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid,
                            price,
                            cid,
                            nc,
                            nr,
                            credit_score,
                            title,
                            item_type,
                            itemid=itemid):
                continue
            brand = clean_brand(brand)
            if not (nc > 0 and itemtsdict.get(itemid)):
                continue
            if shopid == 0:
                # Items without a shop are purged and never aggregated, even
                # if the purge itself fails.
                try:
                    db.execute('delete from ataobao2.item where id=:id',
                               dict(id=itemid))
                    db.execute(
                        'delete from ataobao2.item_by_date where id=:id',
                        dict(id=itemid))
                except Exception:
                    traceback.print_exc()
                continue
            try:
                aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid],
                               shopid, cid, price, brand, name, image,
                               date)
            except Exception:
                # Best effort: one bad item must not abort the whole range.
                traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()