Beispiel #1
0
def aggregate_brands(date, *brands):
    ci = CategoryIndex(date)
    bi = BrandIndex(date)
    try:
        ci.multi()
        bi.multi()
        for brand in brands:
            aggregate_brand(bi, ci, date, brand)

        ci.execute()
        bi.execute()
    except:
        traceback.print_exc()
Beispiel #2
0
def aggregate_brands(date, *brands):
    ci = CategoryIndex(date)
    bi = BrandIndex(date)
    try:
        ci.multi()
        bi.multi()
        for brand in brands:
            aggregate_brand(bi, ci, date, brand)

        ci.execute()
        bi.execute()
    except:
        traceback.print_exc()
Beispiel #3
0
def aggregate_items(start, end, hosts=[], date=None, retry=0):
    if retry >= 20:
        raise Exception('retry too many times, give up')

    if start > end:
        aggregate_items(start, 2**63-1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        date2 = datetime.strptime(date, "%Y-%m-%d")+timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        try:
            if hosts:
                d2 = calendar.timegm(date2.utctimetuple())*1000
                d1 = calendar.timegm(date1.utctimetuple())*1000
                host = hosts[0]
                conn = db.get_connection(host)
                cur = conn.cursor()
                cur.execute('''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end''',
                    dict(start=int(start), end=int(end)))
                iteminfos = list(cur)
                cur.execute('''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date 
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                    dict(start=int(start), end=int(end), date1=d1, date2=d2))
                itemts = list(cur)
                conn.close()
            else:
                iteminfos = db.execute('''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end''',
                    dict(start=int(start), end=int(end)), result=True).results
                itemts = db.execute('''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date 
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                    dict(start=int(start), end=int(end), date1=d1, date2=d2), result=True).results
        except:
            print('cluster error on host {}, range {}, retry {}, sleeping 5 secs...'.format(hosts[0], (start, end), retry))
            hosts = hosts[-1:] + hosts[:-1]
            #traceback.print_exc()
            time.sleep(30)
            return aggregate_items(start, end, date=date, hosts=hosts, retry=retry+1)


        itemtsdict = {}
        for row in itemts:
            itemid, date, values = row[0], row[1], list(row[2:])
            # fix data malform
            # 1. num_colllects, index at 0, should not larger than 2**24 ~ 16 million
            if values[0] > 2**24:
                values[0] = 0
            if isinstance(date, datetime):
                date = (date+timedelta(hours=8)).strftime("%Y-%m-%d")
            else:
                date = datetime.utcfromtimestamp(struct.unpack('!q', date)[0]/1000)
                date = (date+timedelta(hours=8)).strftime("%Y-%m-%d")
            if itemid not in itemtsdict:
                itemtsdict[itemid] = {}
            itemtsdict[itemid][date] = values


        for itemid, shopid, cid, nc, price, brand, name, image, nr, credit_score, title, type in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title, type, itemid=itemid):
                #print itemid, 'skiped'
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemid in itemtsdict and itemtsdict[itemid]:
                try:
                    if shopid == 0:
                        db.execute('delete from ataobao2.item where id=:id', dict(id=itemid))
                        db.execute('delete from ataobao2.item_by_date where id=:id', dict(id=itemid))
                        continue
                except:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid], shopid, cid, price, brand, name, image, datestr)
                except:
                    traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except:
        traceback.print_exc()
Beispiel #4
0
def aggregate_items(start, end, hosts=[], date=None, retry=0):
    if retry >= 20:
        raise Exception('retry too many times, give up')

    if start > end:
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        try:
            if hosts:
                d2 = calendar.timegm(date2.utctimetuple()) * 1000
                d1 = calendar.timegm(date1.utctimetuple()) * 1000
                host = hosts[0]
                conn = db.get_connection(host)
                cur = conn.cursor()
                cur.execute(
                    '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end''',
                    dict(start=int(start), end=int(end)))
                iteminfos = list(cur)
                cur.execute(
                    '''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date 
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                    dict(start=int(start), end=int(end), date1=d1, date2=d2))
                itemts = list(cur)
                conn.close()
            else:
                iteminfos = db.execute(
                    '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end''',
                    dict(start=int(start), end=int(end)),
                    result=True).results
                itemts = db.execute(
                    '''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date 
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering''',
                    dict(start=int(start), end=int(end), date1=d1, date2=d2),
                    result=True).results
        except:
            print(
                'cluster error on host {}, range {}, retry {}, sleeping 5 secs...'
                .format(hosts[0], (start, end), retry))
            hosts = hosts[-1:] + hosts[:-1]
            #traceback.print_exc()
            time.sleep(30)
            return aggregate_items(start,
                                   end,
                                   date=date,
                                   hosts=hosts,
                                   retry=retry + 1)

        itemtsdict = {}
        for row in itemts:
            itemid, date, values = row[0], row[1], list(row[2:])
            # fix data malform
            # 1. num_colllects, index at 0, should not larger than 2**24 ~ 16 million
            if values[0] > 2**24:
                values[0] = 0
            if isinstance(date, datetime):
                date = (date + timedelta(hours=8)).strftime("%Y-%m-%d")
            else:
                date = datetime.utcfromtimestamp(
                    struct.unpack('!q', date)[0] / 1000)
                date = (date + timedelta(hours=8)).strftime("%Y-%m-%d")
            if itemid not in itemtsdict:
                itemtsdict[itemid] = {}
            itemtsdict[itemid][date] = values

        for itemid, shopid, cid, nc, price, brand, name, image, nr, credit_score, title, type in iteminfos:
            if in_blacklist(shopid,
                            price,
                            cid,
                            nc,
                            nr,
                            credit_score,
                            title,
                            type,
                            itemid=itemid):
                #print itemid, 'skiped'
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemid in itemtsdict and itemtsdict[itemid]:
                try:
                    if shopid == 0:
                        db.execute('delete from ataobao2.item where id=:id',
                                   dict(id=itemid))
                        db.execute(
                            'delete from ataobao2.item_by_date where id=:id',
                            dict(id=itemid))
                        continue
                except:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid],
                                   shopid, cid, price, brand, name, image,
                                   datestr)
                except:
                    traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except:
        traceback.print_exc()