def test_price_save_success(self):
    print("Validating save price success")
    global new_price
    pr = Price(new_price)
    pr.part = 1
    result = pr.save_all()
    self.assertEqual(result, True)

def test_price_validation(self):
    """ Testing DB price validation
    """
    global new_price
    print("Testing price validation")
    validate = Price.validate(new_price)
    print("------------")
    print(validate)
    self.assertTrue(validate)
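For context, a minimal sketch of the `new_price` fixture the tests above assume; the field names are inferred from the consumer code further below, so the exact schema may differ.

new_price = {
    "product_uuid": "c5a9f3de-0d2a-4a3c-9c2b-6f4d1e2a7b88",   # hypothetical product
    "retailer": "walmart",                                     # hypothetical retailer key
    "price": 10.5,
    "price_original": 12.0,
    "promo": 0,
    "location": {
        "store": ["16faeaf4-7ace-11e7-9b9f-0242ac110003"]      # hypothetical store UUID
    }
}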
def populate_geoprice_tables(val):
    """ Populate all tables in GeoPrice KS
        
        Params:
        -----
        val : dict
            Price value to insert
    """
    price_val = format_price(val)
    price = Price(price_val)
    #logger.debug("Formatted price info..")
    # A Product UUID must already exist before loading GeoPrice tables
    if isinstance(price.product_uuid, float) and np.isnan(price.product_uuid):
        return False
    #logger.info("Saving All...")
    if price.save_all_batch():
        #logger.debug("Loaded tables for: {}".format(val['product_uuid']))
        return True
    return False
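A hedged sketch of driving this loader from a CSV of scraped prices with pandas; the file path and the column layout of the CSV are assumptions, not the service's actual schema.

import pandas as pd

def load_prices_batch(csv_path):
    """ Feed every row of a prices CSV into populate_geoprice_tables. """
    batch = pd.read_csv(csv_path, low_memory=False)
    loaded = 0
    for row in batch.to_dict(orient='records'):
        if populate_geoprice_tables(row):
            loaded += 1
    return loaded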
def callback(ch, method, properties, body):
    global gcounter
    try:
        new_price = json.loads(body.decode('utf-8'))
        logger.debug("Price " + new_price['retailer'] + " - " +
                     new_price['product_uuid'] + " - stores: " +
                     str(len(new_price['location']['store'])))
        # Validate the received payload to make sure it contains all required fields
        if not Price.validate(new_price):
            logger.warning('Could not validate price')
        else:
            price = Price(new_price)
            # Set Partition value
            price.part = gcounter
            # Save elements
            price.save_all()
            logger.info('Saved price for ' + price.retailer + ' ' +
                        str(price.product_uuid))
            # Publish message to price-cache
            if q_cache in g._producer and g._producer[q_cache]:
                g._producer[q_cache].send(new_price)
                # Modify partition to distribute
                if gcounter >= 20:
                    gcounter = 1
                else:
                    gcounter += 1
            else:
                logger.warning("Producer not initialized!")
                logger.error(g._producer)
    except NoHostAvailable as e:
        logger.error("No Cassandra host available, shutting down...")
        logger.error(e)
        sys.exit()
    except Exception as e:
        logger.error(e)
    ch.basic_ack(delivery_tag=method.delivery_tag)
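A hedged sketch of wiring this callback to a RabbitMQ queue with pika; the queue name, connection settings, and the use of pika itself (rather than the project's own consumer wrapper) are assumptions.

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
channel = connection.channel()
channel.queue_declare(queue='geoprice', durable=True)   # queue name is a placeholder
channel.basic_qos(prefetch_count=1)
channel.basic_consume(queue='geoprice', on_message_callback=callback)
channel.start_consuming()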
Example #5
    def grouped_by_retailer(task_id, filters, rets, _from, until, interval):
        """ Static method to retrieve passed prices by all 
            available stores given UUIDS and retailer keys
            from previous 24 hrs. 
            Result is temporarly stored in Redis

            Params:
            -----
            - task_id: (str) Celery Task ID
            - filters: (list) Item Filters
            - rets: (list) Needed Retailers
            - _from: (str) Date Lower Bound
            - until: (str) Date Upper Bound
            - interval: (str) Interval Type (day, month, week)
        """
        # Task initialization
        task = Task(task_id)
        task.task_id = task_id
        task.progress = 1

        # Decompose filters
        dates, retailers, \
            retailers_by_key, \
            stores, stores_by_uuid, \
            items, items_by_uuid, \
            prods, prods_by_uuid = Historia.decompose_filters(
                                task,
                                filters,
                                rets,
                                _from,
                                until
                            )
        logger.info("Got products from filters")
        # Get prices of all the products
        prices = Price.query_by_product_store(
            stores=list(stores_by_uuid.keys()),
            products=list(prods_by_uuid.keys()),
            dates=dates)
        task.progress = 60
        # If no prices, end task...
        if not prices:
            logger.warning("No prices found!")
            raise Exception("No se encontraron precios, intenta nuevamente")
        # Format prices
        hist_df = pd.DataFrame(prices)
        hist_df['product_uuid'] = hist_df.product_uuid.apply(lambda x: str(x))
        hist_df['store_uuid'] = hist_df.store_uuid.apply(lambda x: str(x))
        hist_df['item_uuid'] = hist_df.product_uuid.apply(
            lambda z: prods_by_uuid[z]['item_uuid'])
        # Add JS-like timestamp
        hist_df.sort_values(['time'], ascending=False, inplace=True)
        hist_df['time_js'] = hist_df.time.apply(
            lambda t: (t - datetime.datetime(1970, 1, 1, 0, 0))
            / datetime.timedelta(seconds=1) * 1000)
        hist_df['date'] = hist_df.time.apply(lambda d: d.date().isoformat())
        # Remove Duplicates, take more recent
        hist_df.drop_duplicates(['item_uuid', 'store_uuid', 'date'],
                                keep='first',
                                inplace=True)
        task.progress = 70
        # Add name and gtin
        hist_df['gtin'] = hist_df.item_uuid.apply(
            lambda z: items_by_uuid[z]['gtin'])
        hist_df['name'] = hist_df.product_uuid.apply(
            lambda z: prods_by_uuid[z]['name'])
        hist_df['store'] = hist_df.store_uuid.apply(
            lambda z: stores_by_uuid[z]['name'])
        hist_df['retailer'] = hist_df.retailer.apply(
            lambda z: retailers_by_key[z])
        # Add Grouping dates
        hist_df = Historia.add_dates(hist_df)
        task.progress = 80
        # Add Retailer Format
        ret_res = []
        for _gk, _gdf in hist_df.groupby(['retailer']):
            _tmp = {"name": _gk, "data": []}
            # Once grouped by retailer aggregate by interval
            for _dk, _ddf in _gdf.groupby(grouping_cols[interval]):
                _tmp['data'].append(
                    [int(_ddf.time_js.min()),
                     round(_ddf.price.mean(), 2)])
            # Append to result
            ret_res.append(_tmp)
        task.progress = 90
        logger.info('Created Retailer result')
        # Add Metrics Format
        met_res = {'avg': [], 'min': [], 'max': []}
        for mj, mrow in hist_df.groupby(grouping_cols[interval]):
            # Take JS Timestamp
            _tmpjs = int(mrow.time_js.min())
            met_res['avg'].append([_tmpjs, round(mrow.price.mean(), 2)])
            met_res['min'].append([_tmpjs, round(mrow.price.min(), 2)])
            met_res['max'].append([_tmpjs, round(mrow.price.max(), 2)])
        logger.info('Created Metrics result')
        # Acumulated result
        result = {
            "metrics": met_res,
            "retailers": ret_res,
            "title": "Tendencia de Precios Regionalizados",
            "subtitle": ("<b> Periodo: </b> {} - {} <br>"\
                    + "<b> Retailers: </b> {}").format(
                        hist_df.time.min().date().isoformat(),
                        hist_df.time.max().date().isoformat(),
                        ', '.join([z['name'] for z in ret_res])
                    )
        }
        task.progress = 100
        return {'data': result, 'msg': 'Task test OK'}
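For reference, a sketch of the payload shape this method returns; the timestamps, prices, and retailer name below are illustrative only.

# {
#     'data': {
#         'metrics': {
#             'avg': [[1577836800000, 10.5], ...],
#             'min': [[1577836800000, 9.9], ...],
#             'max': [[1577836800000, 11.2], ...]
#         },
#         'retailers': [
#             {'name': 'Walmart', 'data': [[1577836800000, 10.5], ...]}
#         ],
#         'title': 'Tendencia de Precios Regionalizados',
#         'subtitle': '<b> Periodo: </b> 2020-01-01 - 2020-01-07 <br><b> Retailers: </b> Walmart'
#     },
#     'msg': 'Task test OK'
# }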
Example #6
    def get_compare_by_store(filters, rets, _ini, _fin, _inter):
        """ Query cassandra and grop response by items vs stores

            Params:
            -----
            - filters: (list) 
                Item Filters [item_uuids, store_uuids, retailers]
            - rets: (list) List of posible retailers to query depending
                    on the client configurations
            - date_start: (str) 
                ISO format Start Date
            - date_end: (str) 
                ISO format End Date 
            - interval: (str) 
                Time interval  
        """
        # Validate Params and decompose filters
        dates, retailers, \
            retailers_by_key, \
            stores, stores_by_uuid, \
            items, items_by_uuid, \
            prods, prods_by_uuid = Dump.decompose_filters(
                                filters,
                                rets,
                                _ini,
                                _fin
                            )
        # Start fetching prices
        prices = Price.query_by_product_store(
            stores=list(stores_by_uuid.keys()),
            products=list(prods_by_uuid.keys()),
            dates=dates)
        if not prices:
            logger.warning("No prices found")
            # Put an empty result
            raise Exception('No se encontraron precios, intenta nuevamente')

        # Load DF and filter results
        prices_df = pd.DataFrame(prices)
        prices_df.product_uuid = prices_df.product_uuid.apply(lambda x: str(x))
        prices_df['item_uuid'] = prices_df.product_uuid.apply(
            lambda z: prods_by_uuid[z]['item_uuid'])
        prices_df.store_uuid = prices_df.store_uuid.apply(lambda x: str(x))

        # Remove Duplicates by day, take more recent
        prices_df.sort_values(['time'], ascending=False, inplace=True)
        prices_df.drop_duplicates(['item_uuid', 'store_uuid', 'date'],
                                  keep='first',
                                  inplace=True)
        logger.info('Dropping duplicates and starting to format..')
        # Merge with item and stores DF
        items_df = pd.DataFrame(items)
        items_df.item_uuid = items_df.item_uuid.apply(lambda x: str(x))
        stores_df = pd.DataFrame(stores).rename(columns={
            "uuid": "store_uuid",
            "name": "store"
        })
        prices_df = pd.merge(prices_df, items_df, on='item_uuid', how='left')
        prices_df = pd.merge(prices_df,
                             stores_df[['store_uuid', 'store']],
                             on='store_uuid',
                             how='left')
        #prices_df.retailer_name = prices_df.retailer.apply(lambda x: rets[x])
        prices_df['retailer_name'] = prices_df['retailer'].apply(
            lambda x: rets[x])

        # Add Grouping dates
        prices_df['day'] = prices_df['time'].apply(lambda x: x.day)
        prices_df['month'] = prices_df['time'].apply(lambda x: x.month)
        prices_df['year'] = prices_df['time'].apply(lambda x: x.year)
        prices_df['week'] = prices_df['time'].apply(
            lambda x: x.isocalendar()[1])

        # Verify if **grouping_periods** is needed
        logger.info('Finished format, writing result..')
        # Group by intervals, item and store
        table_result = []
        for temp_int, i_df in prices_df.groupby(Dump.grouping_cols[_inter] +
                                                ['item_uuid', 'store_uuid']):
            table_result.append({
                'gtin': i_df.gtin.tolist()[0],
                'item_uuid': i_df.item_uuid.tolist()[0],
                'store_uuid': i_df.store_uuid.tolist()[0],
                'retailer': i_df.retailer.tolist()[0],
                'retailer_name': i_df.retailer_name.tolist()[0],
                'name': i_df.name.tolist()[0],
                'promo': i_df.promo.tolist()[0],
                'store': "{} - {}".format(i_df.retailer_name.tolist()[0],
                                          i_df.store.tolist()[0]),
                'price': round(i_df.price.mean(), 2),
                'date': i_df.time.tolist()[0].date().isoformat()
            })
        return table_result
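A hedged usage sketch, assuming this staticmethod lives on the Dump class it references internally; the exact filter schema is defined by Dump.decompose_filters, so the entries and UUIDs below are placeholders. Since the code indexes `rets` by retailer key, a dict of key to display name is assumed.

filters = [
    {"item_uuid": "9a1b2c3d-4e5f-6a7b-8c9d-0e1f2a3b4c5d"},    # hypothetical item
    {"store_uuid": "16faeaf4-7ace-11e7-9b9f-0242ac110003"},   # hypothetical store
    {"retailer": "walmart"}
]
rows = Dump.get_compare_by_store(
    filters=filters,
    rets={"walmart": "Walmart"},
    _ini="2020-01-01",
    _fin="2020-01-07",
    _inter="day"
)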
Example #7
def aggregate_daily(daily_files, _day):
    """ Aggregate data to compute statistics
        and batch load it in to C* table

        Params:
        -----
        daily : list
            Daily prices tmp files 
        day : datetime.date
            Date to set in aggregates
    """
    def _mode(x):
        try:
            return stats.mode(x)[0][0]
        except Exception:
            return np.median(x)

    # File name accumulator
    agg_files = []
    # Compute and reduce number of rows
    for dfile in daily_files:
        daily = pd.read_csv(dfile, low_memory=False)
        # Aggregate data to compute mean, std, max, min, etc.
        aggr_stats = daily.groupby(['product_uuid', 'date']).price\
            .agg([('max_price', 'max'),
                ('avg_price', 'mean'),
                ('min_price', 'min'),
                ('datapoints', 'count'),
                ('std_price', 'std'),
                ('mode_price', lambda x: _mode(x))
                ])
        aggr_stats.fillna(0.0, inplace=True)
        aggr_stats.reset_index(inplace=True)
        # Add retailer
        aggr_stats = pd.merge(aggr_stats,
                              daily[['product_uuid', 'source'
                                     ]].drop_duplicates('product_uuid'),
                              on='product_uuid',
                              how='left')
        # Write agg file
        fagg_name = dfile.replace('.csv', '_agg.csv')
        aggr_stats.to_csv(fagg_name)
        # Accumulate agg files
        agg_files.append(fagg_name)
    logger.info("Computed all mini batch aggregates!")
    # Read all agg files and compute
    daily_aggs = pd.concat(
        [pd.read_csv(af, low_memory=False) for af in agg_files])

    def compute_overall_stats(z):
        """ Calculate stats of stats
        """
        def _mode(x):
            try:
                return stats.mode(x)[0][0]
            except Exception:
                return np.median(x)
        return pd.Series({
            "max_price": z.max_price.max(),
            "min_price": z.min_price.min(),
            "avg_price": ((z.avg_price * z.datapoints).sum() / z.datapoints.sum()) \
                if z.datapoints.sum() else 0.0,
            "datapoints": z.datapoints.sum(),
            "std_price": math.sqrt((z.std_price * z.std_price * z.datapoints).sum() / z.datapoints.sum()) \
                if z.datapoints.sum() else 0.0,
            "mode_price": _mode(z.mode_price.tolist())
        })

    # Aggregate data to compute max
    all_aggr_stats = daily_aggs\
        .groupby(['product_uuid', 'date'])\
        .apply(compute_overall_stats)\
        .fillna(0.0)\
        .reset_index()
    # Aggregate data, add retailer
    all_aggr_stats = pd.merge(all_aggr_stats,
                              daily_aggs[['product_uuid', 'source'
                                          ]].drop_duplicates('product_uuid'),
                              on="product_uuid",
                              how="left")
    # Cast
    all_aggr_stats['datapoints'] = all_aggr_stats['datapoints'].astype(int)
    all_aggr_stats['product_uuid'] = all_aggr_stats['product_uuid'].apply(
        lambda y: UUID(y))
    # Set the date passed
    all_aggr_stats['date'] = int(_day.strftime('%Y%m%d'))
    # Load each element into C*
    for elem in tqdm(all_aggr_stats.to_dict(orient='records'),
                     desc="Writing.."):
        Price.delete_stats_by_product(elem)
        Price.save_stats_by_product(elem)

    # Display metrics
    logger.info("Stored {} daily prices".format(len(all_aggr_stats)))
    logger.info("Prices had the following distribution:\n {}".format(
        all_aggr_stats.datapoints.describe()))
    # Delete all temp files
    for taf in agg_files:
        os.remove(taf)
    for taf in daily_files:
        os.remove(taf)
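A hedged usage sketch; the temp file paths are placeholders for whatever the upstream daily dump step actually produces.

import datetime

tmp_files = ["/tmp/daily_prices_part0.csv", "/tmp/daily_prices_part1.csv"]   # placeholder paths
aggregate_daily(tmp_files, datetime.date(2020, 1, 1))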
def dump_catalogue():
    """ Get the entire catalogue of a retailer and download it
    """
    logger.info("Start retrieving catalogue")
    retailer = request.args.get("retailer", None)
    retailer_name = request.args.get("retailer_name", retailer)
    store_uuid = request.args.get("store_uuid", None)
    store_name = request.args.get("store_name", "Default")
    fmt = request.args.get("fmt", "csv")

    try:
        hours = int(request.args.get("hours", 32))
    except (TypeError, ValueError):
        hours = 32
    if 'extras' in request.args:
        extras = request.args.get("extras").split(',')
    else:
        extras = []

    # If not retailer or store, raise app error
    if not retailer and not store_uuid:
        raise errors.AppError("dump_error", "Missing parameters in request")

    items = []
    items_ret = []
    data_source_param = request.args.get("data_source", None)
    if not data_source_param:
        raise errors.AppError("dump_error", "Missing data_source parameter")
    data_sources = data_source_param.split(',')
    for data_source in data_sources:

        # Get all the items
        logger.info("Getting total items from {}".format(data_source))
        items = items + g._catalogue.get_by_source(
            data_source=data_source, cols=extras + ['item_uuid'], qsize=2000)
        logger.info("Got {} total items".format(len(items)))
        if len(items_ret) == 0:
            items_ret = items_ret + g._catalogue.get_by_source(
                data_source=retailer,
                cols=extras + ['item_uuid', 'gtin'],
                qsize=2000)

    # Fetch UUIDs only for entries with an existing item UUID
    _uuids = set(i['item_uuid'] for i in items if i['item_uuid'])
    _uuids_ret = {i['product_uuid']: i for i in items_ret}

    # Get all the prices of the retailer
    #logger.info("Got {} total items".format(len(items)))
    logger.info("Getting prices from C* form the last {} hours".format(hours))
    catalogue = Price.get_by_store(store_uuid, hours)

    # Only the items that are permitted
    valid = []
    logger.info("Got {} prices".format(len(catalogue)))
    logger.info("Looping through catalogue")
    for c in catalogue:
        try:
            tmp = _uuids_ret[c['product_uuid']]
            # Filter to not return products from outside the data source
            if tmp['item_uuid'] not in _uuids:
                continue
            # Format
            ord_d = OrderedDict([("gtin", tmp['gtin']),
                                 ("item_uuid", tmp.get('item_uuid', None)),
                                 ("name", tmp['name']), ("price", c['price']),
                                 ("price_original", c['price_original']),
                                 ("discount",
                                  (c['price'] - c['price_original'])),
                                 ("promo", c['promo']),
                                 ("retailer", retailer_name),
                                 ("store", store_name)])
            for ex in extras:
                ord_d.update([(ex, tmp[ex])])
            valid.append(ord_d)
        except Exception as e:
            logger.error(e)

    # Build dataframe
    df = pd.DataFrame(valid)
    logger.info("Serving catalogue - {} prods".format(len(df)))
    if fmt == 'json':
        # Transform to dict
        table_head = list(df.columns)
        table_body = [list(v) for v in list(df.values)]
        logger.info("Serving JSON")
        return jsonify({"columns": table_head, "records": table_body})
    # If direct download, drop item_uuid
    df.drop('item_uuid', axis=1, inplace=True)
    return download_dataframe(
        df,
        fmt=fmt,
        name="catalogue_{}_{}".format(
            retailer,
            datetime.datetime.utcnow().strftime("%Y-%m-%d")))
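A hedged sketch of calling this endpoint; the URL prefix is a placeholder since the blueprint registration isn't shown here, and the retailer and store values are illustrative.

import requests

resp = requests.get(
    "http://localhost:8000/price/dump/catalogue",        # placeholder route
    params={
        "retailer": "walmart",                           # hypothetical retailer key
        "retailer_name": "Walmart",
        "store_uuid": "16faeaf4-7ace-11e7-9b9f-0242ac110003",   # hypothetical store
        "store_name": "Walmart Centro",
        "data_source": "walmart",
        "fmt": "csv",
        "hours": 32
    })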