def test_price_save_success(self):
    print("Validating save price success")
    global new_price
    pr = Price(new_price)
    pr.part = 1
    result = pr.save_all()
    self.assertEqual(result, True)
def test_price_validation(self):
    """ Test DB price validation """
    global new_price
    print("Testing price validation")
    validate = Price.validate(new_price)
    print("------------")
    print(validate)
    self.assertTrue(validate)
def populate_geoprice_tables(val):
    """ Populate all tables in the GeoPrice keyspace

        Params:
        -----
        val : dict
            Price value to insert
    """
    price_val = format_price(val)
    price = Price(price_val)
    # logger.debug("Formatted price info..")
    try:
        # A NaN product_uuid means the product UUID was never generated
        if isinstance(price.product_uuid, float) and np.isnan(price.product_uuid):
            raise Exception("Product UUID needs to be generated!")
    except Exception as e:
        logger.error(e)
        return False
    # logger.info("Saving All...")
    if price.save_all_batch():
        # logger.debug("Loaded tables for: {}".format(val['product_uuid']))
        return True
    return False
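# A minimal usage sketch (an assumption, not part of the original module):
# driving populate_geoprice_tables over a list of raw price dicts and
# counting how many loaded. `raw_prices` is a hypothetical argument name.
def populate_batch(raw_prices):
    loaded = 0
    for val in raw_prices:
        # populate_geoprice_tables returns False when the product UUID is
        # missing, and the batch-save result otherwise
        if populate_geoprice_tables(val):
            loaded += 1
    logger.info("Loaded {} of {} prices".format(loaded, len(raw_prices)))
    return loaded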
def callback(ch, method, properties, body):
    """ Queue consumer callback: validate, save and forward a price """
    global gcounter
    try:
        new_price = json.loads(body.decode('utf-8'))
        logger.debug("Price " + new_price['retailer'] + " - "
                     + new_price['product_uuid'] + " - stores: "
                     + str(len(new_price['location']['store'])))
        # Check the received payload to verify it contains all required fields
        if not Price.validate(new_price):
            logger.warning('Could not validate price')
        else:
            price = Price(new_price)
            # Set partition value
            price.part = gcounter
            # Save elements
            price.save_all()
            logger.info('Saved price for ' + price.retailer
                        + ' ' + str(price.product_uuid))
            # Publish message to price-cache
            if q_cache in g._producer and g._producer[q_cache]:
                g._producer[q_cache].send(new_price)
                # Modify partition to distribute writes
                if gcounter >= 20:
                    gcounter = 1
                else:
                    gcounter += 1
            else:
                logger.warning("Producer not initialized!")
                logger.error(g._producer)
    except NoHostAvailable as e:
        logger.error("No Cassandra host available, shutting down...")
        logger.error(e)
        sys.exit()
    except Exception as e:
        logger.error(e)
    ch.basic_ack(delivery_tag=method.delivery_tag)
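# A minimal sketch of how this callback could be attached to a RabbitMQ
# consumer with pika. The queue name `geoprice` and the localhost broker are
# assumptions; the actual service may wrap the connection differently.
import pika

def start_consumer(queue_name='geoprice'):
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host='localhost'))
    channel = connection.channel()
    channel.queue_declare(queue=queue_name, durable=True)
    # Deliver one unacked message at a time so slow saves do not pile up
    channel.basic_qos(prefetch_count=1)
    channel.basic_consume(queue=queue_name, on_message_callback=callback)
    channel.start_consuming()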
def grouped_by_retailer(task_id, filters, rets, _from, until, interval):
    """ Static method to retrieve prices from all available stores
        given UUIDs and retailer keys from the previous 24 hrs.
        The result is temporarily stored in Redis.

        Params:
        -----
        - task_id: (str) Celery Task ID
        - filters: (list) Item Filters
        - rets: (list) Needed Retailers
        - _from: (str) Date Lower Bound
        - until: (str) Date Upper Bound
        - interval: (str) Interval Type (day, month, week)
    """
    # Task initialization
    task = Task(task_id)
    task.task_id = task_id
    task.progress = 1
    # Decompose filters
    dates, retailers, \
        retailers_by_key, \
        stores, stores_by_uuid, \
        items, items_by_uuid, \
        prods, prods_by_uuid = Historia.decompose_filters(
            task, filters, rets, _from, until
        )
    logger.info("Got products from filters")
    # Get prices of all the products
    prices = Price.query_by_product_store(
        stores=list(stores_by_uuid.keys()),
        products=list(prods_by_uuid.keys()),
        dates=dates)
    task.progress = 60
    # If no prices, end task...
    if not prices:
        logger.warning("No prices found!")
        raise Exception("No se encontraron precios, intenta nuevamente")
    # Format prices
    hist_df = pd.DataFrame(prices)
    hist_df['product_uuid'] = hist_df.product_uuid.apply(lambda x: str(x))
    hist_df['store_uuid'] = hist_df.store_uuid.apply(lambda x: str(x))
    hist_df['item_uuid'] = hist_df.product_uuid.apply(
        lambda z: prods_by_uuid[z]['item_uuid'])
    # Add JS-like timestamp (milliseconds since epoch)
    hist_df.sort_values(['time'], ascending=False, inplace=True)
    hist_df['time_js'] = hist_df.time.apply(
        lambda djs: (djs - datetime.datetime(1970, 1, 1, 0, 0))
        / datetime.timedelta(seconds=1) * 1000)
    hist_df['date'] = hist_df.time.apply(lambda d: d.date().isoformat())
    # Remove duplicates, keep the most recent
    hist_df.drop_duplicates(['item_uuid', 'store_uuid', 'date'],
                            keep='first', inplace=True)
    task.progress = 70
    # Add name and GTIN
    hist_df['gtin'] = hist_df.item_uuid.apply(
        lambda z: items_by_uuid[z]['gtin'])
    hist_df['name'] = hist_df.product_uuid.apply(
        lambda z: prods_by_uuid[z]['name'])
    hist_df['store'] = hist_df.store_uuid.apply(
        lambda z: stores_by_uuid[z]['name'])
    hist_df['retailer'] = hist_df.retailer.apply(
        lambda z: retailers_by_key[z])
    # Add grouping dates
    hist_df = Historia.add_dates(hist_df)
    task.progress = 80
    # Add Retailer Format
    ret_res = []
    for _gk, _gdf in hist_df.groupby(['retailer']):
        _tmp = {"name": _gk, "data": []}
        # Once grouped by retailer, aggregate by interval
        for _dk, _ddf in _gdf.groupby(grouping_cols[interval]):
            _tmp['data'].append(
                [int(_ddf.time_js.min()),
                 round(_ddf.price.mean(), 2)])
        # Append to result
        ret_res.append(_tmp)
    task.progress = 90
    logger.info('Created Retailer result')
    # Add Metrics Format
    met_res = {'avg': [], 'min': [], 'max': []}
    for mj, mrow in hist_df.groupby(grouping_cols[interval]):
        # Take JS Timestamp
        _tmpjs = int(mrow.time_js.min())
        met_res['avg'].append([_tmpjs, round(mrow.price.mean(), 2)])
        met_res['min'].append([_tmpjs, round(mrow.price.min(), 2)])
        met_res['max'].append([_tmpjs, round(mrow.price.max(), 2)])
    logger.info('Created Metrics result')
    # Accumulated result
    result = {
        "metrics": met_res,
        "retailers": ret_res,
        "title": "Tendencia de Precios Regionalizados",
        "subtitle": ("<b> Periodo: </b> {} - {} <br>"
                     "<b> Retailers: </b> {}").format(
            hist_df.time.min().date().isoformat(),
            hist_df.time.max().date().isoformat(),
            ', '.join([z['name'] for z in ret_res])
        )
    }
    task.progress = 100
    return {'data': result, 'msg': 'Task test OK'}
def get_compare_by_store(filters, rets, _ini, _fin, _inter):
    """ Query Cassandra and group the response by items vs stores

        Params:
        -----
        - filters: (list) Item Filters [item_uuids, store_uuids, retailers]
        - rets: (list) List of possible retailers to query
            depending on the client configurations
        - _ini: (str) ISO format Start Date
        - _fin: (str) ISO format End Date
        - _inter: (str) Time interval
    """
    # Validate Params and decompose filters
    dates, retailers, \
        retailers_by_key, \
        stores, stores_by_uuid, \
        items, items_by_uuid, \
        prods, prods_by_uuid = Dump.decompose_filters(
            filters, rets, _ini, _fin
        )
    # Start fetching prices
    prices = Price.query_by_product_store(
        stores=list(stores_by_uuid.keys()),
        products=list(prods_by_uuid.keys()),
        dates=dates)
    if not prices:
        logger.warning("No prices found")
        # Return an empty result
        raise Exception('No se encontraron precios, intenta nuevamente')
    # Load DF and filter results
    prices_df = pd.DataFrame(prices)
    prices_df.product_uuid = prices_df.product_uuid.apply(lambda x: str(x))
    prices_df['item_uuid'] = prices_df.product_uuid.apply(
        lambda z: prods_by_uuid[z]['item_uuid'])
    prices_df.store_uuid = prices_df.store_uuid.apply(lambda x: str(x))
    # Remove duplicates by day, keep the most recent
    prices_df.sort_values(['time'], ascending=False, inplace=True)
    prices_df.drop_duplicates(
        ['item_uuid', 'store_uuid', 'date'],
        keep='first', inplace=True
    )
    logger.info('Dropping duplicates and starting to format..')
    # Merge with item and stores DF
    items_df = pd.DataFrame(items)
    items_df.item_uuid = items_df.item_uuid.apply(lambda x: str(x))
    stores_df = pd.DataFrame(stores).rename(columns={
        "uuid": "store_uuid",
        "name": "store"
    })
    prices_df = pd.merge(prices_df, items_df,
                         on='item_uuid', how='left')
    prices_df = pd.merge(prices_df, stores_df[['store_uuid', 'store']],
                         on='store_uuid', how='left')
    prices_df['retailer_name'] = prices_df['retailer'].apply(
        lambda x: rets[x])
    # Add grouping dates
    prices_df['day'] = prices_df['time'].apply(lambda x: x.day)
    prices_df['month'] = prices_df['time'].apply(lambda x: x.month)
    prices_df['year'] = prices_df['time'].apply(lambda x: x.year)
    prices_df['week'] = prices_df['time'].apply(
        lambda x: x.isocalendar()[1])
    logger.info('Finished format, writing result..')
    table_result = []
    # Group by intervals, item and store
    for temp_int, i_df in prices_df.groupby(
            Dump.grouping_cols[_inter] + ['item_uuid', 'store_uuid']):
        table_result.append({
            'gtin': i_df.gtin.tolist()[0],
            'item_uuid': i_df.item_uuid.tolist()[0],
            'store_uuid': i_df.store_uuid.tolist()[0],
            'retailer': i_df.retailer.tolist()[0],
            'retailer_name': i_df.retailer_name.tolist()[0],
            'name': i_df.name.tolist()[0],
            'promo': i_df.promo.tolist()[0],
            'store': "{} - {}".format(i_df.retailer_name.tolist()[0],
                                      i_df.store.tolist()[0]),
            'price': round(i_df.price.mean(), 2),
            'date': i_df.time.tolist()[0].date().isoformat()
        })
    return table_result
def aggregate_daily(daily_files, _day):
    """ Aggregate data to compute statistics and batch load it
        into the C* table

        Params:
        -----
        daily_files : list
            Daily prices tmp files
        _day : datetime.date
            Date to set in aggregates
    """
    def _mode(x):
        try:
            return stats.mode(x)[0][0]
        except Exception:
            return np.median(x)

    # File name accumulator
    agg_files = []
    # Compute and reduce number of rows
    for dfile in daily_files:
        daily = pd.read_csv(dfile, low_memory=False)
        # Aggregate data to compute mean, std, max, min, etc.
        aggr_stats = daily.groupby(['product_uuid', 'date']).price\
            .agg([('max_price', 'max'),
                  ('avg_price', 'mean'),
                  ('min_price', 'min'),
                  ('datapoints', 'count'),
                  ('std_price', 'std'),
                  ('mode_price', lambda x: _mode(x))
                  ])
        aggr_stats.fillna(0.0, inplace=True)
        aggr_stats.reset_index(inplace=True)
        # Add retailer
        aggr_stats = pd.merge(
            aggr_stats,
            daily[['product_uuid', 'source']].drop_duplicates('product_uuid'),
            on='product_uuid', how='left')
        # Write agg file
        fagg_name = dfile.replace('.csv', '_agg.csv')
        aggr_stats.to_csv(fagg_name)
        # Accumulate agg files
        agg_files.append(fagg_name)
    logger.info("Computed all mini batch aggregates!")
    # Read all agg files and compute
    daily_aggs = pd.concat(
        [pd.read_csv(af, low_memory=False) for af in agg_files])

    def compute_overall_stats(z):
        """ Calculate stats of stats """
        return pd.Series({
            "max_price": z.max_price.max(),
            "min_price": z.min_price.min(),
            "avg_price": ((z.avg_price * z.datapoints).sum()
                          / z.datapoints.sum())
            if z.datapoints.sum() else 0.0,
            "datapoints": z.datapoints.sum(),
            "std_price": math.sqrt(
                (z.std_price * z.std_price * z.datapoints).sum()
                / z.datapoints.sum())
            if z.datapoints.sum() else 0.0,
            "mode_price": _mode(z.mode_price.tolist())
        })

    # Aggregate data to compute overall stats per product and date
    all_aggr_stats = daily_aggs\
        .groupby(['product_uuid', 'date'])\
        .apply(compute_overall_stats)\
        .fillna(0.0)\
        .reset_index()
    # Aggregate data, add retailer
    all_aggr_stats = pd.merge(
        all_aggr_stats,
        daily_aggs[['product_uuid', 'source']].drop_duplicates('product_uuid'),
        on="product_uuid", how="left")
    # Cast
    all_aggr_stats['datapoints'] = all_aggr_stats['datapoints'].astype(int)
    all_aggr_stats['product_uuid'] = all_aggr_stats['product_uuid'].apply(
        lambda y: UUID(y))
    # Set the date passed
    all_aggr_stats['date'] = int(_day.strftime('%Y%m%d'))
    # Load each element into C*
    for elem in tqdm(all_aggr_stats.to_dict(orient='records'),
                     desc="Writing.."):
        Price.delete_stats_by_product(elem)
        Price.save_stats_by_product(elem)
    # Display metrics
    logger.info("Stored {} daily prices".format(len(all_aggr_stats)))
    logger.info("Prices had the following distribution:\n {}".format(
        all_aggr_stats.datapoints.describe()))
    # Delete all temp files
    for taf in agg_files:
        os.remove(taf)
    for taf in daily_files:
        os.remove(taf)
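# A minimal sketch (an assumption, not part of the original module) of how
# aggregate_daily might be invoked from a daily job. The /tmp/geoprice/ path
# and the date-prefixed file naming are hypothetical.
import datetime
import glob

def run_daily_aggregation(day=None):
    day = day or datetime.date.today()
    # Collect the day's temporary CSV chunks
    daily_files = sorted(
        glob.glob("/tmp/geoprice/{}_*.csv".format(day.isoformat())))
    if not daily_files:
        logger.warning("No daily tmp files found for {}".format(day))
        return
    aggregate_daily(daily_files, day)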
def dump_catalogue():
    """ Get the entire catalogue of a retailer and download it """
    logger.info("Start retrieving catalogue")
    retailer = request.args.get("retailer", None)
    retailer_name = request.args.get("retailer_name", retailer)
    store_uuid = request.args.get("store_uuid", None)
    store_name = request.args.get("store_name", "Default")
    fmt = request.args.get("fmt", "csv")
    try:
        hours = int(request.args.get("hours", 32))
    except Exception:
        hours = 32
    if 'extras' in request.args:
        extras = request.args.get("extras").split(',')
    else:
        extras = []
    # If neither retailer nor store is given, raise app error
    if not retailer and not store_uuid:
        raise errors.AppError("dump_error", "Missing parameters in request")
    items = []
    items_ret = []
    # data_source is required to fetch catalogue items
    data_source_param = request.args.get("data_source", None)
    if not data_source_param:
        raise errors.AppError("dump_error", "Missing data_source parameter")
    data_sources = data_source_param.split(',')
    for data_source in data_sources:
        # Get all the items
        logger.info("Getting total items from {}".format(data_source))
        items = items + g._catalogue.get_by_source(
            data_source=data_source,
            cols=extras + ['item_uuid'],
            qsize=2000)
        logger.info("Got {} total items".format(len(items)))
        if len(items_ret) == 0:
            items_ret = items_ret + g._catalogue.get_by_source(
                data_source=retailer,
                cols=extras + ['item_uuid', 'gtin'],
                qsize=2000)
    # Fetch UUIDs only with existing Item UUID
    _uuids = set(i['item_uuid'] for i in items if i['item_uuid'])
    _uuids_ret = {i['product_uuid']: i for i in items_ret}
    # Get all the prices of the retailer
    logger.info("Getting prices from C* from the last {} hours".format(hours))
    catalogue = Price.get_by_store(store_uuid, hours)
    # Keep only the items that are permitted
    valid = []
    logger.info("Got {} prices".format(len(catalogue)))
    logger.info("Looping through catalogue")
    for c in catalogue:
        try:
            tmp = _uuids_ret[c['product_uuid']]
            # Filter out products from outside the data source
            if tmp['item_uuid'] not in _uuids:
                continue
            # Format
            ord_d = OrderedDict([
                ("gtin", tmp['gtin']),
                ("item_uuid", tmp.get('item_uuid', None)),
                ("name", tmp['name']),
                ("price", c['price']),
                ("price_original", c['price_original']),
                ("discount", (c['price'] - c['price_original'])),
                ("promo", c['promo']),
                ("retailer", retailer_name),
                ("store", store_name)])
            for ex in extras:
                ord_d.update([(ex, tmp[ex])])
            valid.append(ord_d)
        except Exception as e:
            logger.error(e)
    # Build dataframe
    df = pd.DataFrame(valid)
    logger.info("Serving catalogue - {} prods".format(len(df)))
    if fmt == 'json':
        # Transform to dict
        table_head = list(df.columns)
        table_body = [list(v) for v in list(df.values)]
        logger.info("Serving JSON")
        return jsonify({"columns": table_head, "records": table_body})
    # If direct download, drop item_uuid
    df.drop('item_uuid', axis=1, inplace=True)
    return download_dataframe(
        df, fmt=fmt,
        name="catalogue_{}_{}".format(
            retailer,
            datetime.datetime.utcnow().strftime("%Y-%m-%d")))